mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00

Start work on porting to beautifulsoup4

This commit is contained in:
parent 78b7112012
commit 692230147c
@@ -267,7 +267,7 @@ to go to https://www.nytimes.com/pages/todayspaper/index.html and fetch the list
 of articles that appear in *todays* paper. While more complex than simply using
 :term:`RSS`, the recipe creates an e-book that corresponds very closely to the
 days paper. ``parse_index`` makes heavy use of `BeautifulSoup
-<https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_ to parse
+<https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ to parse
 the daily paper webpage. You can also use other, more modern parsers if you
 dislike BeautifulSoup. calibre comes with `lxml <https://lxml.de/>`_ and
 `html5lib <https://github.com/html5lib/html5lib-python>`_, which are the
File diff suppressed because it is too large
@@ -9,7 +9,6 @@ __docformat__ = "restructuredtext en"

 import os, time, traceback, re, sys, io
 from collections import defaultdict
-from functools import partial
 from contextlib import nested, closing

@@ -17,7 +16,6 @@ from calibre import (browser, __appname__, iswindows, force_unicode,
                      strftime, preferred_encoding, as_unicode, random_user_agent)
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre import entity_to_unicode
 from calibre.web import Recipe
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
@@ -224,14 +222,14 @@ class BasicNewsRecipe(Recipe):
     #:
     #: {
     #:    name      : 'tag name',   #e.g. 'div'
-    #:    attrs     : a dictionary, #e.g. {class: 'advertisment'}
+    #:    attrs     : a dictionary, #e.g. {'class': 'advertisment'}
     #: }
     #:
     #: All keys are optional. For a full explanation of the search criteria, see
-    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#Searching%20the%20Parse%20Tree>`_
+    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`_
     #: A common example::
     #:
-    #:     remove_tags = [dict(name='div', attrs={'class':'advert'})]
+    #:     remove_tags = [dict(name='div', class_='advert')]
     #:
     #: This will remove all `<div class="advert">` tags and all
     #: their children from the downloaded :term:`HTML`.
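A minimal sketch (not part of this commit) of the search idiom the updated
docstring points at: in bs4, class is a multi-valued attribute and the
class_ keyword is the preferred filter, though an attrs dict still works::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div class="advert">ad</div><div>keep</div>', 'html.parser')
    print(soup.find_all('div', class_='advert'))            # bs4 idiom
    print(soup.find_all('div', attrs={'class': 'advert'}))  # equivalent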
@@ -662,7 +660,7 @@ class BasicNewsRecipe(Recipe):
     def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
         '''
         Convenience method that takes an URL to the index page and returns
-        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`_
         of it.

         `url_or_raw`: Either a URL or the downloaded index page as a string
@@ -701,8 +699,7 @@ class BasicNewsRecipe(Recipe):
             from html5_parser import parse
             return parse(_raw)
         else:
             from html5_parser.soup import set_soup_module, parse
             set_soup_module(sys.modules[BeautifulSoup.__module__])
-            return BeautifulSoup(_raw)
+            return parse(_raw, return_root=False)

     def extract_readable_article(self, html, url):
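A sketch of that parse path, assuming a bs4-compatible soup module can be
handed to html5_parser (plain bs4 stands in here for calibre's
BeautifulSoup wrapper module): the C parser builds the tree, the soup
module supplies the node classes, and return_root=False yields the soup
object rather than the root tag::

    from html5_parser.soup import parse, set_soup_module

    import bs4  # assumption; calibre passes sys.modules[BeautifulSoup.__module__]

    set_soup_module(bs4)
    soup = parse('<p>Hello</p>', return_root=False)
    print(soup.p.string)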
@@ -951,7 +948,7 @@ class BasicNewsRecipe(Recipe):
     def _postprocess_html(self, soup, first_fetch, job_info):
         if self.no_stylesheets:
             for link in soup.findAll('link'):
-                if (link.get('type') or 'text/css').lower() == 'text/css' and (link.get('rel') or 'stylesheet').lower() == 'stylesheet':
+                if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)):
                     link.extract()
             for style in soup.findAll('style'):
                 style.extract()
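Why the rel test changed, as a standalone sketch: bs4 tokenizes
multi-valued attributes such as rel into lists, so an equality check
against the string 'stylesheet' stops matching and a membership test is
needed::

    from bs4 import BeautifulSoup

    link = BeautifulSoup('<link rel="alternate stylesheet" href="a.css">',
                         'html.parser').find('link')
    print(link.get('rel'))  # ['alternate', 'stylesheet'] in bs4; a string in bs3
    print('stylesheet' in (link.get('rel') or ('stylesheet',)))  # True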
@@ -960,9 +957,10 @@ class BasicNewsRecipe(Recipe):
         head = soup.find('body')
         if not head:
             head = soup.find(True)
-        style = BeautifulSoup(u'<style type="text/css" title="override_css">%s</style>'%(
-                self.template_css +'\n\n'+(self.get_extra_css() or ''))).find('style')
-        head.insert(len(head.contents), style)
+        css = self.template_css + '\n\n' + (self.get_extra_css() or '')
+        style = soup.new_tag('style', type='text/css', title='override_css')
+        style.append(css)
+        head.append(style)
         if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
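The new_tag/append idiom above, sketched in isolation: bs4 creates
elements through the soup's factory method rather than by parsing a
markup snippet, and appended strings become NavigableString children::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<html><head></head><body></body></html>', 'html.parser')
    style = soup.new_tag('style', type='text/css', title='override_css')
    style.append('body { margin: 0 }')
    soup.head.append(style)
    print(soup)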
@@ -1648,14 +1646,14 @@ class BasicNewsRecipe(Recipe):
     def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
         '''
         Convenience method to take a
-        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
         `Tag` and extract the text from it recursively, including any CDATA sections
         and alt tag attributes. Return a possibly empty unicode string.

         `use_alt`: If `True` try to use the alt attribute for tags that don't
         have any textual content

-        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
         `Tag`
         '''
         if tag is None:
@@ -1686,11 +1684,7 @@ class BasicNewsRecipe(Recipe):

     @classmethod
     def soup(cls, raw):
-        entity_replace = [(re.compile(u'&(\\S+?);'), partial(entity_to_unicode,
-            exceptions=[]))]
-        nmassage = list(BeautifulSoup.MARKUP_MASSAGE)
-        nmassage.extend(entity_replace)
-        return BeautifulSoup(raw, markupMassage=nmassage)
+        return BeautifulSoup(raw)

     @classmethod
     def adeify_images(cls, soup):
@@ -1708,8 +1702,8 @@ class BasicNewsRecipe(Recipe):
             oldParent = item.parent
             myIndex = oldParent.contents.index(item)
             item.extract()
-            divtag = Tag(soup,'div')
-            brtag = Tag(soup,'br')
+            divtag = soup.new_tag('div')
+            brtag = soup.new_tag('br')
             oldParent.insert(myIndex,divtag)
             divtag.append(item)
             divtag.append(brtag)
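The extract-wrap-reinsert pattern used by adeify_images, as a small
self-contained sketch over hypothetical markup::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p><img src="x.png"></p>', 'html.parser')
    img = soup.img
    parent, index = img.parent, img.parent.contents.index(img)
    img.extract()
    div = soup.new_tag('div')   # bs4 factory replaces Tag(soup, 'div')
    div.append(img)
    div.append(soup.new_tag('br'))
    parent.insert(index, div)
    print(soup)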
@@ -20,11 +20,9 @@ import traceback
 from base64 import b64decode
 from httplib import responses

-from html5_parser.soup import parse, set_soup_module
-
 from calibre import browser, relpath, unicode_path
 from calibre.constants import filesystem_encoding, iswindows
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
 from calibre.utils.filenames import ascii_filename
@@ -82,17 +80,14 @@ def basename(url):


 def save_soup(soup, target):
-    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    nm = ns.find('meta')
-    metas = soup.findAll('meta', content=True)
-    added = False
-    for meta in metas:
-        if 'charset' in meta.get('content', '').lower():
-            meta.replaceWith(nm)
-            added = True
-    if not added:
+    for meta in soup.findAll('meta', content=True):
+        if 'charset' in meta['content'].lower():
+            meta.extract()
+    for meta in soup.findAll('meta', charset=True):
+        meta.extract()
+    head = soup.find('head')
+    if head is not None:
+        nm = soup.new_tag('meta', charset='utf-8')
+        head.insert(0, nm)

     selfdir = os.path.dirname(target)
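The new save_soup flow, sketched standalone: strip any charset
declarations already present, then insert a single HTML5-style
<meta charset="utf-8"> at the top of <head>::

    from bs4 import BeautifulSoup

    html = '<head><meta http-equiv="Content-Type" content="text/html; charset=latin-1"></head>'
    soup = BeautifulSoup(html, 'html.parser')
    for meta in soup.find_all('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.find_all('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        head.insert(0, soup.new_tag('meta', charset='utf-8'))
    print(soup)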
@@ -191,18 +186,17 @@ class RecursiveFetcher(object):
                 usrc = self.preprocess_raw_html(usrc, url)
             for pat, repl in nmassage:
                 usrc = pat.sub(repl, usrc)
-            set_soup_module(sys.modules[BeautifulSoup.__module__])
-            soup = parse(usrc, return_root=False)
+            soup = BeautifulSoup(usrc)

             replace = self.prepreprocess_html_ext(soup)
             if replace is not None:
                 replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
                 for pat, repl in nmassage:
                     replace = pat.sub(repl, replace)
-                soup = parse(replace, return_root=False)
+                soup = BeautifulSoup(replace)

         if self.keep_only_tags:
-            body = Tag(soup, 'body')
+            body = soup.new_tag('body')
             try:
                 if isinstance(self.keep_only_tags, dict):
                     self.keep_only_tags = [self.keep_only_tags]
@@ -334,13 +328,19 @@ class RecursiveFetcher(object):
         diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
-        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):  # noqa
-            if tag.has_key('href'):  # noqa
+        for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
+            try:
+                mtype = tag['type']
+            except KeyError:
+                mtype = 'text/css' if tag.name.lower() == 'style' else ''
+            if mtype.lower() != 'text/css':
+                continue
+            if tag.has_attr('href'):
                 iurl = tag['href']
                 if not urlsplit(iurl).scheme:
                     iurl = urljoin(baseurl, iurl, False)
                 with self.stylemap_lock:
-                    if self.stylemap.has_key(iurl):  # noqa
+                    if iurl in self.stylemap:
                         tag['href'] = self.stylemap[iurl]
                         continue
                     try:
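The bs4 replacements for the removed bs3-isms, sketched: Tag.has_key() is
gone, so attribute presence is tested with has_attr() or an attribute
filter, and findAll accepts a list of tag names directly::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<link type="text/css" href="a.css"><style>p{}</style>',
                         'html.parser')
    for tag in soup.findAll(name=['link', 'style']):
        mtype = tag.get('type', 'text/css' if tag.name == 'style' else '')
        if mtype.lower() == 'text/css' and tag.has_attr('href'):
            print(tag['href'])  # prints a.css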
@@ -363,7 +363,7 @@ class RecursiveFetcher(object):
                 if not urlsplit(iurl).scheme:
                     iurl = urljoin(baseurl, iurl, False)
                 with self.stylemap_lock:
-                    if self.stylemap.has_key(iurl):  # noqa
+                    if iurl in self.stylemap:
                         ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                         continue
                     try:
@@ -387,7 +387,7 @@ class RecursiveFetcher(object):
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):  # noqa
+        for tag in soup.findAll('img', src=True):
             iurl = tag['src']
             if iurl.startswith('data:image/'):
                 try:
@@ -401,7 +401,7 @@ class RecursiveFetcher(object):
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
-                if self.imagemap.has_key(iurl):  # noqa
+                if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
@@ -479,12 +479,12 @@ class RecursiveFetcher(object):
                     tag[key] = path+suffix

     def process_return_links(self, soup, baseurl):
-        for tag in soup.findAll(lambda tag: tag.name.lower() == 'a' and tag.has_key('href')):  # noqa
+        for tag in soup.findAll('a', href=True):
             iurl = self.absurl(baseurl, tag, 'href')
             if not iurl:
                 continue
             nurl = self.normurl(iurl)
-            if self.filemap.has_key(nurl):  # noqa
+            if nurl in self.filemap:
                 self.localize_link(tag, 'href', self.filemap[nurl])

     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
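The attribute-filter form used in the last few hunks, as a sketch:
findAll('a', href=True) matches only tags that actually carry the
attribute, replacing the bs3 lambda-plus-has_key() scan, while plain
membership tests replace dict.has_key() ahead of Python 3::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a href="x.html">x</a><a name="anchor">y</a>',
                         'html.parser')
    filemap = {'x.html': 'local/x.html'}  # hypothetical url -> file map
    for tag in soup.findAll('a', href=True):
        nurl = tag['href']
        if nurl in filemap:
            tag['href'] = filemap[nurl]
    print(soup)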
@@ -506,7 +506,7 @@ class RecursiveFetcher(object):
                 if not iurl:
                     continue
                 nurl = self.normurl(iurl)
-                if self.filemap.has_key(nurl):  # noqa
+                if nurl in self.filemap:
                     self.localize_link(tag, 'href', self.filemap[nurl])
                     continue
                 if self.files > self.max_files: