mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use wayback for nytimes pages
This commit is contained in:
parent
1b53ec7fdd
commit
692fa6d4fc
@ -28,6 +28,19 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
|
|||||||
no_javascript = True
|
no_javascript = True
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
articles_are_obfuscated = True
|
||||||
|
delay = 1
|
||||||
|
|
||||||
|
def get_obfuscated_article(self, url):
    """Download the article at ``url`` and return the path of a temporary
    ``.html`` file holding the downloaded data.

    The NYT site parser module is loaded on first use and cached on the
    recipe instance as ``self.nyt_parser``.
    """
    try:
        parser = self.nyt_parser
    except AttributeError:
        # First call on this instance: load the parser module and cache it.
        from calibre.live import load_module
        parser = self.nyt_parser = load_module('calibre.web.site_parsers.nytimes')

    data = parser.download_url(url, self.cloned_browser)

    from calibre.ptempfile import PersistentTemporaryFile
    with PersistentTemporaryFile(suffix='.html') as tmp:
        tmp.write(data)
        return tmp.name
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
if not hasattr(self, 'nyt_parser'):
|
if not hasattr(self, 'nyt_parser'):
|
||||||
|
@ -9,6 +9,9 @@ from xml.sax.saxutils import escape, quoteattr
|
|||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
|
|
||||||
|
module_version = 1 # needed for live updates
|
||||||
|
|
||||||
|
|
||||||
def is_heading(tn):
    """Return True if the type name ``tn`` is one of the four heading block types."""
    heading_blocks = (
        'Heading1Block',
        'Heading2Block',
        'Heading3Block',
        'Heading4Block',
    )
    return tn in heading_blocks
|
||||||
|
|
||||||
@ -116,3 +119,18 @@ def extract_html(soup):
|
|||||||
script = type(u'')(script)
|
script = type(u'')(script)
|
||||||
raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
|
raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
|
||||||
return json_to_html(raw)
|
return json_to_html(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def download_url(url, br):
    """Return the data for ``url`` as bytes, fetched via the Wayback Machine.

    NYT has implemented captcha protection for its article pages, so they
    are retrieved from the Wayback Machine instead. The Wayback Machine is
    flaky, so it is unknown how well this will hold up under load.

    :param url: URL of the article page.
    :param br: browser object; passed to the cached-URL lookup and used to
        download the cached URL when no data is already available for it.
    :return: the page data as bytes (text is encoded as UTF-8).
    """
    from calibre.ebooks.metadata.sources.update import search_engines_module

    engines = search_engines_module()
    # NOTE(review): the lookup result is used without validation -- confirm
    # what wayback_machine_cached_url() returns when no snapshot exists.
    cached_url = engines.wayback_machine_cached_url(url, br)
    data = engines.get_data_for_cached_url(cached_url)
    if data is None:
        # No data already available for the cached URL, so download it.
        data = br.open_novisit(cached_url).read()
    return data if isinstance(data, bytes) else data.encode('utf-8')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user