from urllib.parse import urlparse

from readabilipy import simple_json_from_html_string
import requests

# Browser-like User-Agent sent with every article request.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
)


def clean_articles(rows):
    """Append the URL's network location to each article row.

    Args:
        rows: Iterable of indexable rows laid out as
            (article_id, url, title, byline). Only the first four
            positions are read; any extra columns are dropped.

    Returns:
        A list of five-element lists:
        [article_id, url, title, byline, netloc], where ``netloc`` is the
        network-location part of ``url`` as reported by ``urlparse``.
    """
    return [
        [row[0], row[1], row[2], row[3], urlparse(row[1]).netloc]
        for row in rows
    ]


def get_article(url, *, timeout=30):
    """Download ``url`` and run the page through ReadabiliPy.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            server would block the caller indefinitely.

    Returns:
        A tuple ``(article, netloc)``: ``article`` is whatever
        ``simple_json_from_html_string`` returns for the response body
        (called with ``use_readability=True``), and ``netloc`` is the
        network-location part of ``url``.

    Raises:
        requests.exceptions.RequestException: If the request fails,
            including ``requests.exceptions.Timeout`` when ``timeout``
            elapses.
    """
    headers = {'User-Agent': _USER_AGENT}
    response = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status is not checked: error pages are parsed like any
    # other response body, as before.
    article = simple_json_from_html_string(response.text, use_readability=True)
    return article, urlparse(url).netloc