You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

24 lines
813 B

4 years ago
4 years ago
  1. from urllib.parse import urlparse
  2. from readabilipy import simple_json_from_html_string
  3. import requests
  4. def clean_articles(rows):
  5. #article_id, url, title, byline
  6. out = []
  7. for row in rows:
  8. parsed_uri = urlparse(row[1])
  9. result = '{uri.netloc}'.format(uri=parsed_uri)
  10. out.append([row[0], row[1], row[2], row[3], result])
  11. return out
  12. def get_article(url):
  13. headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
  14. response = requests.get(url, headers=headers)
  15. article = simple_json_from_html_string(response.text, use_readability=True)
  16. parsed_uri = urlparse(url)
  17. result = '{uri.netloc}'.format(uri=parsed_uri)
  18. return article, result