You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

25 lines
813 B

from urllib.parse import urlparse
from readabilipy import simple_json_from_html_string
import requests
def clean_articles(rows):
    """Append each article URL's network location to its row.

    Args:
        rows: Iterable of indexable records laid out as
            ``(article_id, url, title, byline)``. Only indices 0-3 are
            read; any further columns are ignored.

    Returns:
        A new list holding one ``[article_id, url, title, byline, netloc]``
        list per input row, where ``netloc`` is the ``netloc`` component
        that ``urlparse`` reports for the row's URL (an empty string when
        the URL has none).
    """
    return [
        # str() keeps the appended column a string, exactly as the former
        # '{uri.netloc}'.format(...) call did.
        [row[0], row[1], row[2], row[3], str(urlparse(row[1]).netloc)]
        for row in rows
    ]
def get_article(url, timeout=30):
    """Download a page and extract its article content.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Without one, a server that never answers
            would block the caller indefinitely.

    Returns:
        A ``(article, netloc)`` tuple: ``article`` is the result of
        ``simple_json_from_html_string`` applied to the response text with
        ``use_readability=True``; ``netloc`` is the ``netloc`` component
        that ``urlparse`` reports for ``url``, as a string.

    Raises:
        Whatever ``requests.get`` raises for connection problems,
        including its timeout error once ``timeout`` elapses. The HTTP
        status code is not checked, so error pages are parsed like any
        other response.
    """
    # A browser-like User-Agent is sent instead of the requests default.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    article = simple_json_from_html_string(response.text, use_readability=True)
    # str() keeps the second element a string, as the former
    # '{uri.netloc}'.format(...) call did.
    return article, str(urlparse(url).netloc)