Downloading more than 20 years of The New York Times

Articles for the period from 1987 to the present are available without a subscription, and their copyright notice is web-scraping friendly:

“… you may download material from The New York Times on the Web (one machine readable copy and one print copy per page) for your personal, noncommercial use only.”

Why pass up the opportunity to download these articles, then?


Please read their terms of service here.
Please subscribe to The New York Times here.

The script below crawls the public sitemap at spiderbites.nytimes.com year by year, collects every article URL into urls_to_articles.txt, and then downloads each article, saving its headline, dateline, byline, and body to a text file under articles/.

# -*- coding: utf-8 -*-
# Sergei Bugrov
# 7-10-17
#
# Downloads all available articles from https://www.nytimes.com
#
# usage : python nytimes.py
#
# python version : 3.6.1
import requests
import bs4
import os
import errno
import time
import datetime
import re


def download_page(url):
    """Fetch a page, returning None on any common network error."""
    try:
        page = requests.get(url, timeout=10.0)
    except requests.exceptions.Timeout:
        print('Timeout\n')
        return None
    except requests.exceptions.ConnectionError:
        print('ConnectionError\n')
        time.sleep(120)  # back off before the caller retries
        return None
    except requests.exceptions.HTTPError:
        print('HTTPError\n')
        return None
    except requests.exceptions.TooManyRedirects:
        print('TooManyRedirects\n')
        return None
    else:
        return page


def main():
    max_attempts = 10
    r_unwanted = re.compile('[\n\t\r]')
    urls_to_articles = []

    if not os.path.exists('articles/'):
        try:
            os.makedirs('articles/')
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # STEP 1. BUILD THE LIST OF URLS TO ARTICLES
    if not os.path.exists('urls_to_articles.txt'):
        for year in range(1987, datetime.datetime.now().year + 1):
            catalog_page_by_years = 'http://spiderbites.nytimes.com/free_%s/index.html' % (year)
            links_to_parts = []
            attempts = 0
            print('Year: ', year)
            with open('logfile.log', 'w') as f:
                f.write('STEP 1. Year: ' + str(year) + '\n')
            catalog_page = download_page(catalog_page_by_years)
            while not (catalog_page or attempts > max_attempts):
                catalog_page = download_page(catalog_page_by_years)
                attempts += 1
            if catalog_page:
                catalog_page = bs4.BeautifulSoup(catalog_page.text, "lxml")
                # Catalog pages for 1996 and later link with site-absolute
                # hrefs; earlier years link relative to the year's directory.
                if year > 1995:
                    links_to_parts.append(['http://spiderbites.nytimes.com%s' % (el.get('href')) for el in catalog_page.select('body > div > div > div > div > div > div > ul > li > a')])
                else:
                    links_to_parts.append(['http://spiderbites.nytimes.com/free_%s/%s' % (year, el.get('href')) for el in catalog_page.select('body > div > div > div > div > div > div > ul > li > a')])
            links_to_parts = [item for sublist in links_to_parts for item in sublist]
            for link_to_parts in links_to_parts:
                attempts = 0
                parts_page = download_page(link_to_parts)
                while not (parts_page or attempts > max_attempts):
                    parts_page = download_page(link_to_parts)
                    attempts += 1
                if parts_page:
                    parts_page = bs4.BeautifulSoup(parts_page.text, "lxml")
                    urls_to_articles.append([el.get('href') for el in parts_page.select('body > div > div > div > div > ul > li > a')])
        urls_to_articles = [item for sublist in urls_to_articles for item in sublist]
        # Back up the list of URLs
        with open('urls_to_articles.txt', 'w') as output:
            for u in urls_to_articles:
                output.write('%s\n' % (u.strip()))

    # STEP 2. DOWNLOAD ARTICLES
    # If, at some point, Step 2 is interrupted due to unforeseen
    # circumstances (power outage, loss of internet connection), replace the
    # number below (the value of the variable url_num) with the one you will
    # find in logfile.log
    url_num = 0
    if os.path.exists('urls_to_articles.txt') and len(urls_to_articles) == 0:
        with open('urls_to_articles.txt', 'r') as f:
            urls_to_articles = f.read().splitlines()
    print('Number of articles that are about to be downloaded: ', len(urls_to_articles))
    for url in urls_to_articles[url_num:]:
        # Skip entries too short to be real article URLs
        if len(url) > 34:
            attempts = 0
            if url_num % 1000 == 0:
                print('Downloading article #', url_num, ' from ', url)
            with open('logfile.log', 'w') as f:
                f.write('STEP 2. Downloading article #' + str(url_num) + ' from ' + url + '\n')
            article_page = download_page(url)
            while not (article_page or attempts > max_attempts):
                article_page = download_page(url)
                attempts += 1
            if article_page:
                article_page = bs4.BeautifulSoup(article_page.text, "lxml")
                # Older article pages use "articleHeadline"; newer ones use "headline"
                title = [el.getText() for el in article_page.find_all(class_="articleHeadline")]
                if len(title) > 0:
                    title = title[0]
                else:
                    title = [el.getText() for el in article_page.find_all(class_="headline")]
                    if len(title) > 0:
                        title = title[0]
                    else:
                        title = ""
                dateline = [el.getText() for el in article_page.find_all(class_="dateline")]
                if len(dateline) > 0:
                    dateline = dateline[0]
                else:
                    dateline = ""
                byline = [el.getText().strip() for el in article_page.find_all(class_="byline")]
                if len(byline) > 0:
                    byline = ' '.join(byline)
                else:
                    byline = ""
                # Likewise, older pages use "articleBody"; newer ones use "story-body-text"
                body = [el.getText() for el in article_page.find_all(class_="articleBody")]
                if len(body) > 0:
                    body = '\n'.join(body)
                    body = r_unwanted.sub("", body)
                    body = re.sub(' +', ' ', body)
                    with open('articles/' + str(url_num) + url.split('/')[-1] + '.txt', 'w') as output:
                        output.write('(c) ' + str(datetime.datetime.now().year) + ' The New York Times Company\n')
                        output.write(url + '\n')
                        output.write(title + '\n')
                        output.write(dateline + '\n')
                        output.write(byline + '\n')
                        output.write('\n' + body)
                else:
                    body = [el.getText() for el in article_page.find_all(class_="story-body-text")]
                    if len(body) > 0:
                        body = '\n'.join(body)
                        body = r_unwanted.sub("", body)
                        body = re.sub(' +', ' ', body)
                        with open('articles/' + str(url_num) + url.split('/')[-1] + '.txt', 'w') as output:
                            output.write('(c) ' + str(datetime.datetime.now().year) + ' The New York Times Company\n')
                            output.write(url + '\n')
                            output.write(title + '\n')
                            output.write(dateline + '\n')
                            output.write(byline + '\n')
                            output.write('\n' + body)
        # Increment for every URL (not only downloaded ones), so the
        # logged number stays a valid index into urls_to_articles
        url_num += 1


if __name__ == '__main__':
    # main() is called when nytimes.py is run from the command line
    main()
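
Resuming after an interruption is manual: you open logfile.log, find the last article number, and paste it into the script as the new value of url_num. Here is a minimal sketch of automating that step, assuming the single-line 'STEP 2. Downloading article #N from URL' format that main() writes; the resume_url_num helper is my own name, not part of the script above.

    import os
    import re

    def resume_url_num(logfile='logfile.log'):
        # Recover the last logged article number, or 0 on a fresh start.
        # Assumes the format: 'STEP 2. Downloading article #<N> from <url>'
        if not os.path.exists(logfile):
            return 0
        with open(logfile, 'r') as f:
            match = re.search(r'article #(\d+)', f.read())
        return int(match.group(1)) if match else 0

    # In main(), replace the hard-coded line with:
    # url_num = resume_url_num()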
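As a side note, the manual retry loops around download_page can be replaced with the retry support built into requests itself. This is a sketch of that alternative, not what the script above does: it mounts urllib3's Retry on a requests.Session and adds a short pause between requests to keep the crawl polite.

    import time
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    # Retry up to 10 times, backing off exponentially, on connection
    # errors and on common transient HTTP status codes.
    retries = Retry(total=10, backoff_factor=1,
                    status_forcelist=[429, 500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    def download_page(url):
        try:
            page = session.get(url, timeout=10.0)
            time.sleep(1)  # roughly one request per second
            return page
        except requests.exceptions.RequestException as e:
            print(type(e).__name__ + '\n')
            return None

With this version the while-retry loops in main() become unnecessary, since the session retries internally before giving up.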

Next time, I’ll modify the code so you can download articles from another major online newspaper.

3 thoughts on “Downloading more than 20 years of The New York Times”

  1. With a few small edits, you could add the ability to download comments and commenter info. It seems that although the Community API has been deprecated, it still works fine.

    • Although the Article Search API won’t return the full text, it can be useful for finding abstracts and keywords for the articles. My point is that the API is a great option if you want some additional data.
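
      For instance, a minimal sketch of that idea, assuming a valid key from developer.nytimes.com (MY_KEY below is a placeholder); the Article Search API returns headlines, abstracts, and keywords, but not full article bodies:

          import requests

          API_KEY = 'MY_KEY'  # placeholder: register at https://developer.nytimes.com

          def search_articles(query, page=0):
              # Query the Article Search API v2 for metadata about matching articles.
              resp = requests.get(
                  'https://api.nytimes.com/svc/search/v2/articlesearch.json',
                  params={'q': query, 'page': page, 'api-key': API_KEY},
                  timeout=10.0,
              )
              resp.raise_for_status()
              return resp.json()['response']['docs']

          for doc in search_articles('climate change'):
              print(doc['headline']['main'], '-', doc['web_url'])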
