mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: More fixes for news downloads
This commit is contained in:
parent
bfbc31fa9f
commit
c12c80e174
@ -8,7 +8,6 @@ from collections import defaultdict
|
|||||||
|
|
||||||
from polyglot.builtins import reraise, unicode_type
|
from polyglot.builtins import reraise, unicode_type
|
||||||
|
|
||||||
from lxml.etree import tostring
|
|
||||||
from lxml.html import (fragment_fromstring, document_fromstring,
|
from lxml.html import (fragment_fromstring, document_fromstring,
|
||||||
tostring as htostring)
|
tostring as htostring)
|
||||||
|
|
||||||
@ -315,7 +314,7 @@ class Document:
|
|||||||
def transform_misused_divs_into_paragraphs(self):
|
def transform_misused_divs_into_paragraphs(self):
|
||||||
for elem in self.tags(self.html, 'div'):
|
for elem in self.tags(self.html, 'div'):
|
||||||
# transform <div>s that do not contain other block elements into <p>s
|
# transform <div>s that do not contain other block elements into <p>s
|
||||||
if not REGEXES['divToPElementsRe'].search(unicode_type(''.join(map(tostring, list(elem))))):
|
if not REGEXES['divToPElementsRe'].search(unicode_type(''.join(map(tounicode, list(elem))))):
|
||||||
# self.debug("Altering %s to p" % (describe(elem)))
|
# self.debug("Altering %s to p" % (describe(elem)))
|
||||||
elem.tag = "p"
|
elem.tag = "p"
|
||||||
# print "Fixed element "+describe(elem)
|
# print "Fixed element "+describe(elem)
|
||||||
|
@ -341,7 +341,7 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7,
|
|||||||
from calibre.web.feeds.feedparser import parse
|
from calibre.web.feeds.feedparser import parse
|
||||||
# Handle unclosed escaped entities. They trip up feedparser and HBR for one
|
# Handle unclosed escaped entities. They trip up feedparser and HBR for one
|
||||||
# generates them
|
# generates them
|
||||||
raw_xml = re.sub(r'(&#\d+)([^0-9;])', r'\1;\2', raw_xml)
|
raw_xml = re.sub(br'(&#\d+)([^0-9;])', br'\1;\2', raw_xml)
|
||||||
feed = parse(raw_xml)
|
feed = parse(raw_xml)
|
||||||
pfeed = Feed(get_article_url=get_article_url, log=log)
|
pfeed = Feed(get_article_url=get_article_url, log=log)
|
||||||
pfeed.populate_from_feed(feed, title=title,
|
pfeed.populate_from_feed(feed, title=title,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user