New York Times articles from 1987 to the present are available without a subscription. The newspaper's copyright notice is web-scraping friendly:
“… you may download material from The New York Times on the Web (one machine readable copy and one print copy per page) for your personal, noncommercial use only.”
Why waste the opportunity to download these articles then?
Please read their terms of service here.
Please subscribe to The New York Times here.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Sergei Bugrov
# 7-10-17
#
# Downloads all available articles from https://www.nytimes.com
#
# usage : python nytimes.py
#
# python version : 3.6.1
import requests, bs4, os, errno, time, datetime, re | |
def download_page(url):
    """Fetch ``url`` with a 10-second timeout and return the response.

    Every failure raised by ``requests`` is reported on stdout and turned
    into a ``None`` result, so a single bad URL cannot abort a long crawl;
    callers decide whether to retry.

    Args:
        url: Address of the page to download.

    Returns:
        The ``requests`` response object, or ``None`` if the request raised
        a ``requests`` exception.
    """
    try:
        page = requests.get(url, timeout=10.0)
    except requests.exceptions.Timeout:
        print('Timeout\n')
        return None
    except requests.exceptions.ConnectionError:
        print('ConnectionError\n')
        # Wait two minutes before handing control back to the caller;
        # presumably this gives a dropped connection time to recover.
        time.sleep(120)
        return None
    except requests.exceptions.HTTPError:
        print('HTTPError\n')
        return None
    except requests.exceptions.TooManyRedirects:
        print('TooManyRedirects\n')
        return None
    except requests.exceptions.RequestException as err:
        # Catch-all for the remaining requests errors (invalid URL, broken
        # chunked encoding, ...) that previously propagated and killed the run.
        print('RequestException: %s\n' % (err))
        return None
    return page
# Characters removed from article bodies, and runs of spaces to collapse.
_UNWANTED_WHITESPACE = re.compile('[\n\t\r]')
_SPACE_RUN = re.compile(' +')


def _download_with_retries(url, max_attempts):
    """Call download_page(url) until it returns a truthy page or retries run out.

    NOTE: requests responses evaluate as false for 4xx/5xx status codes, so
    error responses are retried just like failed requests.
    """
    page = download_page(url)
    attempts = 0
    while not page and attempts <= max_attempts:
        page = download_page(url)
        attempts += 1
    return page


def _select_hrefs(soup, selector):
    """Return the stripped, non-empty href values of anchors matching selector."""
    hrefs = (el.get('href') for el in soup.select(selector))
    # Anchors without an href yield None; skip them instead of crashing later.
    return [href.strip() for href in hrefs if href and href.strip()]


def _first_text(soup, class_names):
    """Return the text of the first element found for class_names (tried in order), or ''."""
    for class_name in class_names:
        element = soup.find(class_=class_name)
        if element is not None:
            return element.getText()
    return ''


def _write_log(message):
    """Overwrite logfile.log with message, so it always holds the latest checkpoint."""
    with open('logfile.log', 'w', encoding='utf-8') as f:
        f.write(message)


def _collect_article_urls(max_attempts):
    """STEP 1: crawl the yearly catalog pages and return all article URLs found."""
    parts_selector = 'body > div > div > div > div > div > div > ul > li > a'
    articles_selector = 'body > div > div > div > div > ul > li > a'
    article_urls = []
    for year in range(1987, datetime.datetime.now().year + 1):
        catalog_page_by_years = 'http://spiderbites.nytimes.com/free_%s/index.html' % (year)
        print('Year: ', year)
        _write_log('STEP 1. Year: ' + str(year) + '\n')
        catalog_page = _download_with_retries(catalog_page_by_years, max_attempts)
        if not catalog_page:
            continue
        catalog_soup = bs4.BeautifulSoup(catalog_page.text, "lxml")
        # The link format differs by year: after 1995 the href is appended
        # directly to the host, earlier years are resolved under free_<year>/.
        if year > 1995:
            prefix = 'http://spiderbites.nytimes.com'
        else:
            prefix = 'http://spiderbites.nytimes.com/free_%s/' % (year)
        for href in _select_hrefs(catalog_soup, parts_selector):
            parts_page = _download_with_retries(prefix + href, max_attempts)
            if parts_page:
                parts_soup = bs4.BeautifulSoup(parts_page.text, "lxml")
                article_urls.extend(_select_hrefs(parts_soup, articles_selector))
    return article_urls


def _extract_article(soup):
    """Return (title, dateline, byline, body) for an article page, or None if no body is found."""
    paragraphs = [el.getText() for el in soup.find_all(class_="articleBody")]
    if not paragraphs:
        # Fallback for the other page layout the script knows about.
        paragraphs = [el.getText() for el in soup.find_all(class_="story-body-text")]
    if not paragraphs:
        return None
    title = _first_text(soup, ('articleHeadline', 'headline'))
    dateline = _first_text(soup, ('dateline',))
    byline = ' '.join(el.getText().strip() for el in soup.find_all(class_="byline"))
    # NOTE: the substitution below also removes the '\n' joiners, so the body
    # ends up on a single line (unchanged from the original behaviour).
    body = '\n'.join(paragraphs)
    body = _UNWANTED_WHITESPACE.sub("", body)
    body = _SPACE_RUN.sub(' ', body)
    return title, dateline, byline, body


def _save_article(url_num, url, title, dateline, byline, body):
    """Write one article to articles/<url_num><last URL segment>.txt as UTF-8."""
    path = 'articles/' + str(url_num) + url.split('/')[-1] + '.txt'
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # characters that occur in article text.
    with open(path, 'w', encoding='utf-8') as output:
        output.write('(c) ' + str(datetime.datetime.now().year) + ' The New York Times Company\n')
        output.write(url + '\n')
        output.write(title + '\n')
        output.write(dateline + '\n')
        output.write(byline + '\n')
        output.write('\n' + body)


def main():
    """Download all available New York Times articles into ./articles/.

    Step 1 builds the list of article URLs from the spiderbites catalog and
    backs it up to urls_to_articles.txt; it is skipped when that file already
    exists. Step 2 downloads every listed article and stores its headline,
    dateline, byline and body in one text file per article. Progress is
    written to logfile.log.
    """
    max_attempts = 10
    urls_to_articles = []
    os.makedirs('articles/', exist_ok=True)

    # STEP 1. BUILD THE LIST OF URLS TO ARTICLES
    if not os.path.exists('urls_to_articles.txt'):
        urls_to_articles = _collect_article_urls(max_attempts)
        # Backing up the list of URLs
        with open('urls_to_articles.txt', 'w', encoding='utf-8') as output:
            for u in urls_to_articles:
                output.write('%s\n' % (u))

    # STEP 2. DOWNLOAD ARTICLES
    # If, at some point, Step 2 is interrupted due to unforeseen
    # circumstances (power outage, loss of internet connection), replace the number
    # (value of the variable url_num) below with the one you will find in the logfile.log
    url_num = 0
    if os.path.exists('urls_to_articles.txt') and len(urls_to_articles) == 0:
        with open('urls_to_articles.txt', 'r', encoding='utf-8') as f:
            urls_to_articles = f.read().splitlines()
    print('Number of articles that are about to be downloaded: ', len(urls_to_articles))
    # enumerate keeps url_num equal to the URL's index in the full list, so
    # the number in the log file can be used directly as the resume point.
    for url_num, url in enumerate(urls_to_articles[url_num:], start=url_num):
        # Presumably filters out entries too short to point at an actual
        # article (34 is the original threshold) - TODO confirm.
        if len(url) <= 34:
            continue
        if url_num % 1000 == 0:
            print('Downloading article #', url_num, ' from ', url)
            _write_log('STEP 2. Downloading article #' + str(url_num) + ' from ' + url + '\n')
        article_page = _download_with_retries(url, max_attempts)
        if not article_page:
            continue
        article = _extract_article(bs4.BeautifulSoup(article_page.text, "lxml"))
        if article is not None:
            _save_article(url_num, url, *article)
# Script entry point: main() is called when nytimes.py is run from the
# command line, not when the module is imported.
if __name__ == "__main__":
    main()
Next time, I’ll modify the code so you can download articles from some other major online newspaper.
I am unable to download these
LikeLike
With a few small edits, you could add the ability to download comments/commenter info. It seems that although the Community API has been deprecated, it still works fine.
LikeLike
Although the Article Search API won’t download the full text, it might be useful for finding abstracts and keywords for the articles. My point is that the API is a great idea if you want to get some additional data.
LikeLike