mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
py3: More fixes for news downloads
This commit is contained in:
parent
bfbc31fa9f
commit
c12c80e174
@@ -8,7 +8,6 @@ from collections import defaultdict
|
||||
|
||||
from polyglot.builtins import reraise, unicode_type
|
||||
|
||||
from lxml.etree import tostring
|
||||
from lxml.html import (fragment_fromstring, document_fromstring,
|
||||
tostring as htostring)
|
||||
|
||||
@@ -315,7 +314,7 @@ class Document:
|
||||
def transform_misused_divs_into_paragraphs(self):
|
||||
for elem in self.tags(self.html, 'div'):
|
||||
# transform <div>s that do not contain other block elements into <p>s
|
||||
-if not REGEXES['divToPElementsRe'].search(unicode_type(''.join(map(tostring, list(elem))))):
|
||||
+if not REGEXES['divToPElementsRe'].search(unicode_type(''.join(map(tounicode, list(elem))))):
|
||||
# self.debug("Altering %s to p" % (describe(elem)))
|
||||
elem.tag = "p"
|
||||
# print "Fixed element "+describe(elem)
|
||||
|
@@ -341,7 +341,7 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7,
|
||||
from calibre.web.feeds.feedparser import parse
|
||||
# Handle unclosed escaped entities. They trip up feedparser and HBR for one
|
||||
# generates them
|
||||
-raw_xml = re.sub(r'(&#\d+)([^0-9;])', r'\1;\2', raw_xml)
|
||||
+raw_xml = re.sub(br'(&#\d+)([^0-9;])', br'\1;\2', raw_xml)
|
||||
feed = parse(raw_xml)
|
||||
pfeed = Feed(get_article_url=get_article_url, log=log)
|
||||
pfeed.populate_from_feed(feed, title=title,
|
||||
|
Loading…
x
Reference in New Issue
Block a user