from urllib.parse import urlparse
|
|
from readabilipy import simple_json_from_html_string
|
|
import requests
|
|
|
|
def clean_articles(rows):
    """Append the URL's network location (domain) to every article row.

    Each row is indexed positionally as ``(article_id, url, title, byline)``;
    only the first four columns are read.

    Args:
        rows: Iterable of indexable rows with at least four columns.

    Returns:
        A list of ``[article_id, url, title, byline, domain]`` lists, where
        ``domain`` is ``urlparse(url).netloc`` as a ``str``. It is ``""``
        when the URL is missing/empty or has no network location.

    Raises:
        IndexError: If a row has fewer than four columns.
    """
    cleaned = []
    for row in rows:
        url = row[1]
        # urlparse(None) yields a *bytes* result; formatting that through
        # str.format used to produce the literal text "b''" as the domain.
        # Treat a missing/empty URL as having no domain instead.
        domain = urlparse(url).netloc if url else ""
        if isinstance(domain, bytes):
            # Bytes URLs are parsed via ASCII, so the host decodes cleanly.
            domain = domain.decode("ascii")
        cleaned.append([row[0], url, row[2], row[3], domain])
    return cleaned
|
|
|
|
|
|
def get_article(url, timeout=30):
    """Download a page and run it through ReadabiliPy.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout an unresponsive
            server would block the caller indefinitely.

    Returns:
        A ``(article, domain)`` tuple: ``article`` is whatever
        ``simple_json_from_html_string(..., use_readability=True)`` returns
        for the response text, and ``domain`` is the URL's network location
        (``urlparse(url).netloc``) as a ``str``.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout elapses.
    """
    # Send a browser-like User-Agent along with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)

    # NOTE: the HTTP status is not checked, so the body of an error page
    # (404, 500, ...) is parsed like any other document.
    article = simple_json_from_html_string(response.text, use_readability=True)

    domain = urlparse(url).netloc
    if isinstance(domain, bytes):
        # A bytes URL gives a bytes netloc; keep the return value a str.
        domain = domain.decode("ascii")

    return article, domain
|