|
|
- from urllib.parse import urlparse
- from readabilipy import simple_json_from_html_string
- import requests
-
def clean_articles(rows):
    """Append each article URL's network location to its row.

    Rows are read positionally as ``(article_id, url, title, byline)``;
    any columns beyond the fourth are ignored.

    Args:
        rows: Iterable of indexable rows with at least four items, the
            second of which is the article URL.

    Returns:
        A new list of ``[article_id, url, title, byline, netloc]`` lists,
        where ``netloc`` is the ``netloc`` component of ``urlparse(url)``
        (an empty string when the URL has none). The input rows are not
        modified.

    Raises:
        IndexError: If a row has fewer than four items.
    """
    # Read netloc straight off the parse result instead of round-tripping
    # it through str.format.
    return [
        [row[0], row[1], row[2], row[3], urlparse(row[1]).netloc]
        for row in rows
    ]
-
-
def get_article(url, timeout=30):
    """Download *url* and extract its article content with readabilipy.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without a timeout an unresponsive server would block the
            caller indefinitely.

    Returns:
        A ``(article, netloc)`` tuple: ``article`` is whatever
        ``simple_json_from_html_string(..., use_readability=True)`` returns
        for the response text, and ``netloc`` is the ``netloc`` component
        of ``urlparse(url)``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (including ``Timeout`` once *timeout* elapses).
    """
    # Browser-like User-Agent sent with the request.
    # NOTE(review): presumably needed because some sites reject the default
    # requests agent - confirm.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)

    # NOTE: the HTTP status is not checked here, so the body of an error
    # response is parsed like any other page.
    article = simple_json_from_html_string(response.text, use_readability=True)

    return article, urlparse(url).netloc
|