py3: More fixes for news downloads

This commit is contained in:
Kovid Goyal 2019-04-23 16:04:20 +05:30
parent bfbc31fa9f
commit c12c80e174
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 2 additions and 3 deletions

View File

@ -8,7 +8,6 @@ from collections import defaultdict
from polyglot.builtins import reraise, unicode_type
from lxml.etree import tostring
from lxml.html import (fragment_fromstring, document_fromstring,
tostring as htostring)
@ -315,7 +314,7 @@ class Document:
def transform_misused_divs_into_paragraphs(self):
for elem in self.tags(self.html, 'div'):
# transform <div>s that do not contain other block elements into <p>s
if not REGEXES['divToPElementsRe'].search(unicode_type(''.join(map(tostring, list(elem))))):
if not REGEXES['divToPElementsRe'].search(unicode_type(''.join(map(tounicode, list(elem))))):
# self.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
# print "Fixed element "+describe(elem)

View File

@ -341,7 +341,7 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7,
from calibre.web.feeds.feedparser import parse
# Handle unclosed escaped entities. They trip up feedparser and HBR for one
# generates them
raw_xml = re.sub(r'(&amp;#\d+)([^0-9;])', r'\1;\2', raw_xml)
raw_xml = re.sub(br'(&amp;#\d+)([^0-9;])', br'\1;\2', raw_xml)
feed = parse(raw_xml)
pfeed = Feed(get_article_url=get_article_url, log=log)
pfeed.populate_from_feed(feed, title=title,