Start work on porting to beautifulsoup4

commit 692230147c
parent 78b7112012
@@ -267,7 +267,7 @@ to go to https://www.nytimes.com/pages/todayspaper/index.html and fetch the list
 of articles that appear in *todays* paper. While more complex than simply using
 :term:`RSS`, the recipe creates an e-book that corresponds very closely to the
 days paper. ``parse_index`` makes heavy use of `BeautifulSoup
-<https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_ to parse
+<https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ to parse
 the daily paper webpage. You can also use other, more modern parsers if you
 dislike BeautifulSoup. calibre comes with `lxml <https://lxml.de/>`_ and
 `html5lib <https://github.com/html5lib/html5lib-python>`_, which are the
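The manual text above mentions that BeautifulSoup can delegate to faster or more lenient parsers. A minimal standalone sketch (not calibre code) of selecting a tree builder in bs4, assuming the lxml and html5lib packages are installed:

    from bs4 import BeautifulSoup

    html = '<div class="story"><a href="/a1">Article one</a></div>'

    # Default pure-Python parser shipped with bs4
    soup_default = BeautifulSoup(html, 'html.parser')
    # lxml: fast, lenient, C-based
    soup_lxml = BeautifulSoup(html, 'lxml')
    # html5lib: parses exactly as browsers do, slowest
    soup_html5 = BeautifulSoup(html, 'html5lib')

    for soup in (soup_default, soup_lxml, soup_html5):
        print(soup.find('a')['href'])  # '/a1' from all three builders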
File diff suppressed because it is too large.
@@ -9,7 +9,6 @@ __docformat__ = "restructuredtext en"
 
 import os, time, traceback, re, sys, io
 from collections import defaultdict
-from functools import partial
 from contextlib import nested, closing
 
 
@@ -17,7 +16,6 @@ from calibre import (browser, __appname__, iswindows, force_unicode,
     strftime, preferred_encoding, as_unicode, random_user_agent)
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre import entity_to_unicode
 from calibre.web import Recipe
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
@@ -224,14 +222,14 @@ class BasicNewsRecipe(Recipe):
     #:
     #: {
     #:    name : 'tag name', #e.g. 'div'
-    #:    attrs : a dictionary, #e.g. {class: 'advertisment'}
+    #:    attrs : a dictionary, #e.g. {'class': 'advertisment'}
     #: }
     #:
     #: All keys are optional. For a full explanation of the search criteria, see
-    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#Searching%20the%20Parse%20Tree>`_
+    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`_
     #: A common example::
     #:
-    #:    remove_tags = [dict(name='div', attrs={'class':'advert'})]
+    #:    remove_tags = [dict(name='div', class_='advert')]
     #:
     #: This will remove all `<div class="advert">` tags and all
     #: their children from the downloaded :term:`HTML`.
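The updated docstring example relies on bs4's keyword-argument search, where class_ matches the multi-valued class attribute. A hedged, standalone illustration of the kind of filter such an entry implies (not the actual calibre removal machinery):

    from bs4 import BeautifulSoup

    html = '<body><div class="advert big">ad</div><div class="story">keep</div></body>'
    soup = BeautifulSoup(html, 'html.parser')

    remove_tags = [dict(name='div', class_='advert')]
    for spec in remove_tags:
        for tag in soup.find_all(**spec):
            tag.decompose()  # remove the tag and all its children

    print(soup)  # <body><div class="story">keep</div></body>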
@@ -662,7 +660,7 @@ class BasicNewsRecipe(Recipe):
     def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
         '''
         Convenience method that takes an URL to the index page and returns
-        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`_
         of it.
 
         `url_or_raw`: Either a URL or the downloaded index page as a string
@@ -701,8 +699,7 @@ class BasicNewsRecipe(Recipe):
             from html5_parser import parse
             return parse(_raw)
         else:
-            from html5_parser.soup import set_soup_module, parse
-            set_soup_module(sys.modules[BeautifulSoup.__module__])
+            return BeautifulSoup(_raw)
             return parse(_raw, return_root=False)
 
     def extract_readable_article(self, html, url):
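The two branches return different tree types: html5_parser.parse() yields an lxml element tree, while the bs4 path yields a soup object. A small sketch of what callers see from each, assuming the html5-parser and beautifulsoup4 packages are installed:

    from html5_parser import parse
    from bs4 import BeautifulSoup

    raw = '<p>Hello <b>world</b></p>'

    root = parse(raw)                # lxml element, wrapped in a full html tree
    print(root.tag)                  # 'html'
    print(root.xpath('//b/text()'))  # ['world']

    soup = BeautifulSoup(raw, 'html.parser')
    print(soup.b.get_text())         # 'world'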
@@ -951,7 +948,7 @@ class BasicNewsRecipe(Recipe):
     def _postprocess_html(self, soup, first_fetch, job_info):
         if self.no_stylesheets:
             for link in soup.findAll('link'):
-                if (link.get('type') or 'text/css').lower() == 'text/css' and (link.get('rel') or 'stylesheet').lower() == 'stylesheet':
+                if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)):
                     link.extract()
             for style in soup.findAll('style'):
                 style.extract()
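The rewritten condition reflects a real bs3/bs4 difference: bs4 treats rel as a multi-valued attribute and returns a list, so an equality test against the string 'stylesheet' would silently fail. A quick standalone check:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<link rel="alternate stylesheet" href="a.css">', 'html.parser')
    link = soup.find('link')
    print(link.get('rel'))  # ['alternate', 'stylesheet'], a list, not a string
    print('stylesheet' in (link.get('rel') or ('stylesheet',)))  # True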
@@ -960,9 +957,10 @@ class BasicNewsRecipe(Recipe):
             head = soup.find('body')
         if not head:
             head = soup.find(True)
-        style = BeautifulSoup(u'<style type="text/css" title="override_css">%s</style>'%(
-            self.template_css +'\n\n'+(self.get_extra_css() or ''))).find('style')
-        head.insert(len(head.contents), style)
+        css = self.template_css + '\n\n' + (self.get_extra_css() or '')
+        style = soup.new_tag('style', type='text/css', title='override_css')
+        style.append(css)
+        head.append(style)
         if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
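In bs4, new elements come from the soup's factory method rather than the Tag constructor, and a string payload can simply be appended. A minimal sketch of the pattern the new code uses:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<html><head></head><body></body></html>', 'html.parser')
    head = soup.find('head')

    style = soup.new_tag('style', type='text/css', title='override_css')
    style.append('body { margin: 0 }')  # the string becomes the tag's text node
    head.append(style)                  # tag.append replaces insert(len(contents), ...)

    print(head)  # the <style> element now sits at the end of <head>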
@@ -1648,14 +1646,14 @@ class BasicNewsRecipe(Recipe):
     def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
         '''
         Convenience method to take a
-        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
         `Tag` and extract the text from it recursively, including any CDATA sections
         and alt tag attributes. Return a possibly empty unicode string.
 
         `use_alt`: If `True` try to use the alt attribute for tags that don't
         have any textual content
 
-        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
         `Tag`
         '''
         if tag is None:
@@ -1686,11 +1684,7 @@ class BasicNewsRecipe(Recipe):
 
     @classmethod
     def soup(cls, raw):
-        entity_replace = [(re.compile(u'&(\\S+?);'), partial(entity_to_unicode,
-            exceptions=[]))]
-        nmassage = list(BeautifulSoup.MARKUP_MASSAGE)
-        nmassage.extend(entity_replace)
-        return BeautifulSoup(raw, markupMassage=nmassage)
+        return BeautifulSoup(raw)
 
     @classmethod
     def adeify_images(cls, soup):
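bs4 dropped bs3's markupMassage hook, which is why the soup() helper shrinks to a plain constructor call: any regex clean-up now has to happen on the raw string before parsing. A hedged sketch of doing the old entity replacement up front; entity_to_unicode here is a hypothetical stand-in for calibre's helper of the same name:

    import re
    from bs4 import BeautifulSoup

    def entity_to_unicode(match):
        # hypothetical stand-in for calibre's entity_to_unicode
        from html import unescape
        return unescape(match.group())

    def soup_with_massage(raw):
        raw = re.sub(r'&(\S+?);', entity_to_unicode, raw)  # massage first
        return BeautifulSoup(raw, 'html.parser')           # then parse

    print(soup_with_massage('<p>&amp;copy; &copy;</p>').p.get_text())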
@@ -1708,8 +1702,8 @@ class BasicNewsRecipe(Recipe):
                 oldParent = item.parent
                 myIndex = oldParent.contents.index(item)
                 item.extract()
-                divtag = Tag(soup,'div')
-                brtag = Tag(soup,'br')
+                divtag = soup.new_tag('div')
+                brtag = soup.new_tag('br')
                 oldParent.insert(myIndex,divtag)
                 divtag.append(item)
                 divtag.append(brtag)
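The same new_tag factory drives the image wrapping above, and the extract/insert/append sequence re-parents an existing node. A standalone sketch of wrapping a tag in a fresh div with bs4:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<body><img src="x.png"/>text</body>', 'html.parser')
    item = soup.find('img')

    parent = item.parent
    index = parent.contents.index(item)
    item.extract()             # detach the img from the tree
    div = soup.new_tag('div')
    parent.insert(index, div)  # put the wrapper where the img was
    div.append(item)           # re-attach the img inside the wrapper
    div.append(soup.new_tag('br'))

    print(soup)  # the img and br now live inside the new div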
@@ -20,11 +20,9 @@ import traceback
 from base64 import b64decode
 from httplib import responses
 
-from html5_parser.soup import parse, set_soup_module
-
 from calibre import browser, relpath, unicode_path
 from calibre.constants import filesystem_encoding, iswindows
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
 from calibre.utils.filenames import ascii_filename
@@ -82,18 +80,15 @@ def basename(url):
 
 
 def save_soup(soup, target):
-    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    nm = ns.find('meta')
-    metas = soup.findAll('meta', content=True)
-    added = False
-    for meta in metas:
-        if 'charset' in meta.get('content', '').lower():
-            meta.replaceWith(nm)
-            added = True
-    if not added:
-        head = soup.find('head')
-        if head is not None:
-            head.insert(0, nm)
+    for meta in soup.findAll('meta', content=True):
+        if 'charset' in meta['content'].lower():
+            meta.extract()
+    for meta in soup.findAll('meta', charset=True):
+        meta.extract()
+    head = soup.find('head')
+    if head is not None:
+        nm = soup.new_tag('meta', charset='utf-8')
+        head.insert(0, nm)
 
     selfdir = os.path.dirname(target)
 
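The rewritten save_soup drops both http-equiv charset declarations and HTML5 <meta charset> tags, then inserts a single UTF-8 one at the top of <head>. A compact standalone sketch of that normalization with bs4:

    from bs4 import BeautifulSoup

    html = ('<head><meta http-equiv="Content-Type" content="text/html; charset=latin-1">'
            '<meta charset="ascii"></head><body>x</body>')
    soup = BeautifulSoup(html, 'html.parser')

    for meta in soup.find_all('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()  # drop old http-equiv charset declarations
    for meta in soup.find_all('meta', charset=True):
        meta.extract()      # drop old <meta charset=...> tags
    head = soup.find('head')
    if head is not None:
        head.insert(0, soup.new_tag('meta', charset='utf-8'))

    print(soup.head)  # a single <meta charset="utf-8"/> remains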
@@ -191,18 +186,17 @@ class RecursiveFetcher(object):
             usrc = self.preprocess_raw_html(usrc, url)
             for pat, repl in nmassage:
                 usrc = pat.sub(repl, usrc)
-            set_soup_module(sys.modules[BeautifulSoup.__module__])
-            soup = parse(usrc, return_root=False)
+            soup = BeautifulSoup(usrc)
 
             replace = self.prepreprocess_html_ext(soup)
             if replace is not None:
                 replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
                 for pat, repl in nmassage:
                     replace = pat.sub(repl, replace)
-                soup = parse(replace, return_root=False)
+                soup = BeautifulSoup(replace)
 
             if self.keep_only_tags:
-                body = Tag(soup, 'body')
+                body = soup.new_tag('body')
                 try:
                     if isinstance(self.keep_only_tags, dict):
                         self.keep_only_tags = [self.keep_only_tags]
@@ -334,13 +328,19 @@ class RecursiveFetcher(object):
         diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
         if not os.path.exists(diskpath):
            os.mkdir(diskpath)
-        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')): # noqa
-            if tag.has_key('href'): # noqa
+        for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
+            try:
+                mtype = tag['type']
+            except KeyError:
+                mtype = 'text/css' if tag.name.lower() == 'style' else ''
+            if mtype.lower() != 'text/css':
+                continue
+            if tag.has_attr('href'):
                 iurl = tag['href']
                 if not urlsplit(iurl).scheme:
                     iurl = urljoin(baseurl, iurl, False)
                 with self.stylemap_lock:
-                    if self.stylemap.has_key(iurl): # noqa
+                    if iurl in self.stylemap:
                         tag['href'] = self.stylemap[iurl]
                         continue
                 try:
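Two bs4 idioms show up in this hunk: attribute filters passed straight to findAll (src=True, href=True in later hunks) and tag.has_attr() replacing bs3's dict-style has_key(). A quick sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<link rel="stylesheet" type="text/css" href="a.css"><style>p{}</style>',
        'html.parser')

    for tag in soup.find_all(name=['link', 'style']):
        # default the type for <style> tags that omit it
        mtype = tag.get('type', 'text/css' if tag.name == 'style' else '')
        if mtype.lower() != 'text/css':
            continue
        if tag.has_attr('href'):          # bs4 replacement for tag.has_key('href')
            print('external sheet:', tag['href'])
        else:
            print('inline sheet:', tag.get_text())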
@@ -363,7 +363,7 @@ class RecursiveFetcher(object):
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
-                        if self.stylemap.has_key(iurl): # noqa
+                        if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                     try:
@@ -387,7 +387,7 @@ class RecursiveFetcher(object):
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): # noqa
+        for tag in soup.findAll('img', src=True):
             iurl = tag['src']
             if iurl.startswith('data:image/'):
                 try:
@@ -401,7 +401,7 @@ class RecursiveFetcher(object):
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
-                if self.imagemap.has_key(iurl): # noqa
+                if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
@@ -479,12 +479,12 @@ class RecursiveFetcher(object):
                     tag[key] = path+suffix
 
     def process_return_links(self, soup, baseurl):
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): # noqa
+        for tag in soup.findAll('a', href=True):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
-           if self.filemap.has_key(nurl): # noqa
+           if nurl in self.filemap:
                self.localize_link(tag, 'href', self.filemap[nurl])
 
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
@@ -506,7 +506,7 @@ class RecursiveFetcher(object):
                 if not iurl:
                     continue
                 nurl = self.normurl(iurl)
-                if self.filemap.has_key(nurl): # noqa
+                if nurl in self.filemap:
                     self.localize_link(tag, 'href', self.filemap[nurl])
                     continue
                 if self.files > self.max_files:
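The remaining hunks all make the same mechanical substitution: dict.has_key() was removed in Python 3 (and never existed on bs4 tags), so the noqa-suppressed calls become plain membership tests with the in operator. The equivalence in miniature:

    stylemap = {'http://example.com/a.css': 'stylesheets/1.css'}
    iurl = 'http://example.com/a.css'

    # old, Python 2 only:  stylemap.has_key(iurl)
    print(iurl in stylemap)      # True, works on Python 2 and 3
    print(iurl not in stylemap)  # False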