mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
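Update the Washington Post recipes
Fix the order of HTML parsing in preprocess_raw_html: call parse(raw) before the '/interactive/' check, because that branch reads the headline via root.xpath('//h1').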
This commit is contained in:
parent
1d2f6fd03b
commit
7b551046d2
@ -94,11 +94,11 @@ class TheWashingtonPost(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, url):
|
def preprocess_raw_html(self, raw, url):
|
||||||
|
root = parse(raw)
|
||||||
if '/interactive/' in url:
|
if '/interactive/' in url:
|
||||||
return ('<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>'
|
return ('<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>'
|
||||||
'This article is supposed to be read in a browser.'
|
'This article is supposed to be read in a browser.'
|
||||||
'</em></article></body></html>')
|
'</em></article></body></html>')
|
||||||
root = parse(raw)
|
|
||||||
m = root.xpath('//script[@id="__NEXT_DATA__"]')
|
m = root.xpath('//script[@id="__NEXT_DATA__"]')
|
||||||
|
|
||||||
data = json.loads(m[0].text)
|
data = json.loads(m[0].text)
|
||||||
|
@ -81,11 +81,11 @@ class wapoprint(BasicNewsRecipe):
|
|||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, url):
|
def preprocess_raw_html(self, raw, url):
|
||||||
|
root = parse(raw)
|
||||||
if '/interactive/' in url:
|
if '/interactive/' in url:
|
||||||
return ('<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>'
|
return ('<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>'
|
||||||
'This article is supposed to be read in a browser.'
|
'This article is supposed to be read in a browser.'
|
||||||
'</em></article></body></html>')
|
'</em></article></body></html>')
|
||||||
root = parse(raw)
|
|
||||||
m = root.xpath('//script[@id="__NEXT_DATA__"]')
|
m = root.xpath('//script[@id="__NEXT_DATA__"]')
|
||||||
|
|
||||||
data = json.loads(m[0].text)
|
data = json.loads(m[0].text)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user