mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use wayback for nytimes pages
This commit is contained in:
parent
1b53ec7fdd
commit
692fa6d4fc
@ -28,6 +28,19 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
|
||||
no_javascript = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
encoding = 'utf-8'
|
||||
articles_are_obfuscated = True
|
||||
delay = 1
|
||||
|
||||
def get_obfuscated_article(self, url):
    """Download the article at *url* and return the path of a temp file holding it.

    The NYT site parser module is loaded on first use and cached on the
    recipe instance as ``nyt_parser``. The downloaded data is written to a
    persistent temporary file with a ``.html`` suffix, and the name of that
    file is returned.
    """
    # Load the parser module lazily, only once per recipe instance.
    if not hasattr(self, 'nyt_parser'):
        from calibre.live import load_module
        self.nyt_parser = load_module('calibre.web.site_parsers.nytimes')

    article_data = self.nyt_parser.download_url(url, self.cloned_browser)

    from calibre.ptempfile import PersistentTemporaryFile
    with PersistentTemporaryFile(suffix='.html') as html_file:
        html_file.write(article_data)
    return html_file.name
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
if not hasattr(self, 'nyt_parser'):
|
||||
|
@ -9,6 +9,9 @@ from xml.sax.saxutils import escape, quoteattr
|
||||
from calibre.utils.iso8601 import parse_iso8601
|
||||
|
||||
|
||||
module_version = 1 # needed for live updates
|
||||
|
||||
|
||||
def is_heading(tn):
    """Return True when *tn* is one of the four heading block type names."""
    # A tuple (not a set) is used so unhashable values simply compare unequal.
    heading_block_names = (
        'Heading1Block',
        'Heading2Block',
        'Heading3Block',
        'Heading4Block',
    )
    return tn in heading_block_names
|
||||
|
||||
@ -116,3 +119,18 @@ def extract_html(soup):
|
||||
script = type(u'')(script)
|
||||
raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
|
||||
return json_to_html(raw)
|
||||
|
||||
|
||||
def download_url(url, br):
    """Fetch the page for *url* through the Wayback Machine and return it as bytes.

    *br* is the browser object used for the Wayback lookup and, when no
    data is already available for the cached URL, for the download itself.
    """
    # NYT has implemented captcha protection for its article pages, so the
    # pages are fetched from the Wayback Machine instead. The Wayback Machine
    # is flaky, so it is unknown how well this will hold up under load.
    from calibre.ebooks.metadata.sources.update import search_engines_module

    engines = search_engines_module()
    cached_url = engines.wayback_machine_cached_url(url, br)
    # NOTE(review): presumably cached_url can be empty when no snapshot
    # exists; confirm how the calls below behave in that case.
    page_data = engines.get_data_for_cached_url(cached_url)
    if page_data is None:
        page_data = br.open_novisit(cached_url).read()

    # Callers get bytes regardless of what the helpers returned.
    if isinstance(page_data, bytes):
        return page_data
    return page_data.encode('utf-8')
|
||||
|
Loading…
x
Reference in New Issue
Block a user