mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	Cleanup previous PR
This commit is contained in:
		
							parent
							
								
									4fd2a30f78
								
							
						
					
					
						commit
						26c10fd4d8
					
				@ -82,17 +82,6 @@ def new_tag(soup, name, attrs=()):
 | 
				
			|||||||
    return Tag(soup, name, attrs=attrs or None)
 | 
					    return Tag(soup, name, attrs=attrs or None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def clean_js_json(text):
 | 
					 | 
				
			||||||
    text = text.replace('undefined', 'null')
 | 
					 | 
				
			||||||
    text = re.sub(
 | 
					 | 
				
			||||||
        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', 
 | 
					 | 
				
			||||||
        '',
 | 
					 | 
				
			||||||
        text,
 | 
					 | 
				
			||||||
        flags=re.DOTALL
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    return text
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class NewYorkTimes(BasicNewsRecipe):
 | 
					class NewYorkTimes(BasicNewsRecipe):
 | 
				
			||||||
    if is_web_edition:
 | 
					    if is_web_edition:
 | 
				
			||||||
        title = 'The New York Times (Web)'
 | 
					        title = 'The New York Times (Web)'
 | 
				
			||||||
@ -138,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
 | 
				
			|||||||
        return self.index_to_soup(url, raw=True)
 | 
					        return self.index_to_soup(url, raw=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def preprocess_raw_html(self, raw_html, url):
 | 
					    def preprocess_raw_html(self, raw_html, url):
 | 
				
			||||||
        cleaned = clean_js_json(raw_html)
 | 
					        cleaned = self.nyt_parser.clean_js_json(raw_html)
 | 
				
			||||||
        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 | 
					        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    articles_are_obfuscated = use_wayback_machine
 | 
					    articles_are_obfuscated = use_wayback_machine
 | 
				
			||||||
@ -225,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
 | 
				
			|||||||
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
 | 
					        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
 | 
				
			||||||
        script = type(u'')(script)
 | 
					        script = type(u'')(script)
 | 
				
			||||||
        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
 | 
					        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
 | 
				
			||||||
        clean_json = clean_js_json(raw_json)
 | 
					        clean_json = self.nyt_parser.clean_js_json(raw_json)
 | 
				
			||||||
        self.nytimes_graphql_config = json.loads(clean_json)['config']
 | 
					        self.nytimes_graphql_config = json.loads(clean_json)['config']
 | 
				
			||||||
        return soup
 | 
					        return soup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -82,19 +82,6 @@ def new_tag(soup, name, attrs=()):
 | 
				
			|||||||
    return Tag(soup, name, attrs=attrs or None)
 | 
					    return Tag(soup, name, attrs=attrs or None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def clean_js_json(text):
 | 
					 | 
				
			||||||
    text = text.replace('undefined', 'null')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # drop any JS function definitions
 | 
					 | 
				
			||||||
    text = re.sub(
 | 
					 | 
				
			||||||
        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', 
 | 
					 | 
				
			||||||
        '',
 | 
					 | 
				
			||||||
        text,
 | 
					 | 
				
			||||||
        flags=re.DOTALL
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    return text
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class NewYorkTimes(BasicNewsRecipe):
 | 
					class NewYorkTimes(BasicNewsRecipe):
 | 
				
			||||||
    if is_web_edition:
 | 
					    if is_web_edition:
 | 
				
			||||||
        title = 'The New York Times (Web)'
 | 
					        title = 'The New York Times (Web)'
 | 
				
			||||||
@ -140,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
 | 
				
			|||||||
        return self.index_to_soup(url, raw=True)
 | 
					        return self.index_to_soup(url, raw=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def preprocess_raw_html(self, raw_html, url):
 | 
					    def preprocess_raw_html(self, raw_html, url):
 | 
				
			||||||
        cleaned = clean_js_json(raw_html)
 | 
					        cleaned = self.nyt_parser.clean_js_json(raw_html)
 | 
				
			||||||
        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 | 
					        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    articles_are_obfuscated = use_wayback_machine
 | 
					    articles_are_obfuscated = use_wayback_machine
 | 
				
			||||||
@ -227,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
 | 
				
			|||||||
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
 | 
					        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
 | 
				
			||||||
        script = type(u'')(script)
 | 
					        script = type(u'')(script)
 | 
				
			||||||
        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
 | 
					        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
 | 
				
			||||||
        clean_json = clean_js_json(raw_json)
 | 
					        clean_json = self.nyt_parser.clean_js_json(raw_json)
 | 
				
			||||||
        self.nytimes_graphql_config = json.loads(clean_json)['config']
 | 
					        self.nytimes_graphql_config = json.loads(clean_json)['config']
 | 
				
			||||||
        return soup
 | 
					        return soup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -99,7 +99,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
 | 
				
			|||||||
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
 | 
					        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
 | 
				
			||||||
        script = type(u'')(script)
 | 
					        script = type(u'')(script)
 | 
				
			||||||
        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
 | 
					        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
 | 
				
			||||||
        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
 | 
					        self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config']
 | 
				
			||||||
        return soup
 | 
					        return soup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
 | 
					    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
 | 
				
			||||||
 | 
				
			|||||||
@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from calibre.utils.iso8601 import parse_iso8601
 | 
					from calibre.utils.iso8601 import parse_iso8601
 | 
				
			||||||
 | 
					
 | 
				
			||||||
module_version = 11  # needed for live updates
 | 
					module_version = 12  # needed for live updates
 | 
				
			||||||
pprint
 | 
					pprint
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -195,8 +195,19 @@ def article_parse(data):
 | 
				
			|||||||
    yield '</body></html>'
 | 
					    yield '</body></html>'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def clean_js_json(text):
 | 
				
			||||||
 | 
					    text = re.sub(r'\bundefined\b', 'null', text)
 | 
				
			||||||
 | 
					    text = re.sub(
 | 
				
			||||||
 | 
					        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}',
 | 
				
			||||||
 | 
					        '',
 | 
				
			||||||
 | 
					        text,
 | 
				
			||||||
 | 
					        flags=re.DOTALL
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    return text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def json_to_html(raw):
 | 
					def json_to_html(raw):
 | 
				
			||||||
    data = json.loads(raw.replace(':undefined', ':null'))
 | 
					    data = json.loads(clean_js_json(raw))
 | 
				
			||||||
    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
 | 
					    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
 | 
				
			||||||
    try:
 | 
					    try:
 | 
				
			||||||
        data = data['initialData']['data']
 | 
					        data = data['initialData']['data']
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user