diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe
index 485cf45245..591155ff85 100644
--- a/recipes/cgm_pl.recipe
+++ b/recipes/cgm_pl.recipe
@@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
             del item['style']
         ad=soup.findAll('a')
         for r in ad:
-            if 'http://www.hustla.pl' in r['href']:
+            if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
                 r.extract()
         gallery=soup.find('div', attrs={'class':'galleryFlash'})
         if gallery:
diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe
index 31ffb2db66..32dd347405 100644
--- a/recipes/folhadesaopaulo_sub.recipe
+++ b/recipes/folhadesaopaulo_sub.recipe
@@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
     __author__ = 'fluzao'
     description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
                   u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
-    INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'
+
+    # found this to be the easiest place to find the index page (13-Nov-2011).
+    # searching for the "Indice Geral" link
+    HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
+    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
+
     language = 'pt'
     no_stylesheets = True
     max_articles_per_feed = 40
     remove_javascript = True
     needs_subscription = True
-    remove_tags_before = dict(name='b')
+
+    remove_tags_before = dict(name='p')
     remove_tags = [dict(name='td', attrs={'align':'center'})]
     remove_attributes = ['height','width']
-    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
-
     # fixes the problem with the section names
     section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
                     'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
                     'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
-                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
+                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
+                    'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
+                    'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
 
     # this solves the problem with truncated content in Kindle
     conversion_options = {'linearize_tables' : True}
 
     # this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
     #    Indice e Comunicar Erros
-    preprocess_regexps = [(re.compile(r'Texto Anterior:.*',
-                                      re.DOTALL|re.IGNORECASE), lambda match: r''),
-                          (re.compile(r'Próximo Texto:.*',
+    preprocess_regexps = [(re.compile(r'.*Comunicar Erros',
                                       re.DOTALL|re.IGNORECASE), lambda match: r'')]
 
     def get_browser(self):
@@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):
 
     def parse_index(self):
-        soup = self.index_to_soup(self.INDEX)
+        #Searching for the index page on the HOMEPAGE
+        hpsoup = self.index_to_soup(self.HOMEPAGE)
+        indexref = hpsoup.find('a', href=re.compile('^indices.*'))
+        self.log('--> tag containing the today s index: ', indexref)
+        INDEX = indexref['href']
+        INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX
+        self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
+        # ... and taking the opportunity to get the cover image link
+        coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
+        if coverurl:
+            self.log('--> tag containing the today s cover: ', coverurl)
+            coverurl = coverurl.replace('htm', 'jpg')
+            coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl
+            self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
+            self.cover_url = coverurl
+
+        #soup = self.index_to_soup(self.INDEX)
+        soup = self.index_to_soup(INDEX)
+
         feeds = []
         articles = []
         section_title = "Preambulo"
@@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
                 self.log('--> new section title: ', section_title)
             if strpost.startswith(' post: ', post)
@@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
         # keeping the front page url
         minha_capa = feeds[0][1][1]['url']
 
-        # removing the 'Preambulo' section
+        # removing the first section (now called 'top')
         del feeds[0]
 
-        # creating the url for the cover image
-        coverurl = feeds[0][1][0]['url']
-        coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
-        coverurl = coverurl.replace('01.htm', '.jpg')
-        self.cover_url = coverurl
-
         # inserting the cover page as the first article (nicer for kindle users)
         feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
         return feeds
+
+
diff --git a/recipes/historia_pl.recipe b/recipes/historia_pl.recipe
index 26cda733b2..34ca158a96 100644
--- a/recipes/historia_pl.recipe
+++ b/recipes/historia_pl.recipe
@@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
     category = 'history'
     language = 'pl'
     oldest_article = 8
+    remove_empty_feeds=True
     max_articles_per_feed = 100
 
-    feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]
+    feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
+             (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
+             (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
+             (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
+             (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
+             (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
+             (u'Rekonstrukcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
+             (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
+             (u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]
diff --git a/recipes/icons/infra_pl.png b/recipes/icons/infra_pl.png
new file mode 100644
index 0000000000..5607a7c983
Binary files /dev/null and b/recipes/icons/infra_pl.png differ
diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe
new file mode 100644
index 0000000000..0e035e0980
--- /dev/null
+++ b/recipes/infra_pl.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class INFRA(BasicNewsRecipe):
+    title = u'INFRA'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    __author__ = 'fenuks'
+    description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
+    cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
+    category = 'UFO'
+    language = 'pl'
+    max_articles_per_feed = 100
+    no_stylesheets=True
+    remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
+    remove_tags_after=dict(attrs={'class':'pagenav'})
+    remove_tags=[dict(attrs={'class':'pagenav'})]
+    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
diff --git a/recipes/spiders_web_pl.png b/recipes/spiders_web_pl.png
new file mode 100644
index 0000000000..499dd19c8f
Binary files /dev/null and b/recipes/spiders_web_pl.png differ
diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe
new file mode 100644
index 0000000000..d615f01aa9
--- /dev/null
+++ b/recipes/spiders_web_pl.recipe
@@ -0,0 +1,15 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class SpidersWeb(BasicNewsRecipe):
+    title = u"Spider's Web"
+    oldest_article = 7
+    __author__ = 'fenuks'
+    description = u''
+    cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg'
+    category = 'IT, WEB'
+    language = 'pl'
+    max_articles_per_feed = 100
+    remove_tags_before=dict(name="h1", attrs={'class':'Title'})
+    remove_tags_after=dict(name="div", attrs={'class':'Text'})
+    remove_tags=[dict(name='div', attrs={'class':['Tags', 'CommentCount FloatL', 'Show FloatL']})]
+    feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]
diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe
index af317d1b09..d06e32d9af 100644
--- a/recipes/tablety_pl.recipe
+++ b/recipes/tablety_pl.recipe
@@ -9,6 +9,7 @@ class Tablety_pl(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    keep_only_tags=[dict(name='header', attrs={'class':'entry-header'}), dict(name='div', attrs={'class':'entry-content clearfix'})]
-    remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'}), dict(name='span', attrs={'class':'dsq-postid'})]
+    remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
+    remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
+    remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index e083b38490..20ee9dde5d 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -106,7 +106,7 @@ class ANDROID(USBMS):
             0x61c5 : [0x100, 0x226, 0x9999],
             0x61cc : [0x100],
             0x61ce : [0x100],
-            0x618e : [0x226, 0x9999, 0x100]
+            0x618e : [0x226, 0x227, 0x9999, 0x100]
             },
 
     # Archos
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 819cd674fc..52dd109b47 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -12,7 +12,7 @@ from urllib import urlencode
 from threading import Thread
 from Queue import Queue, Empty
 
-from lxml.html import soupparser, tostring
+from lxml.html import tostring
 
 from calibre import as_unicode
 from calibre.ebooks.metadata import check_isbn
@@ -23,6 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.library.comments import sanitize_comments_html
 from calibre.utils.date import parse_date
 from calibre.utils.localization import canonicalize_lang
+from calibre.utils.soupparser import fromstring
 
 class Worker(Thread): # Get details {{{
@@ -199,7 +200,7 @@ class Worker(Thread): # Get details {{{
             return
 
         try:
-            root = soupparser.fromstring(clean_ascii_chars(raw))
+            root = fromstring(clean_ascii_chars(raw))
         except:
             msg = 'Failed to parse amazon details page: %r'%self.url
             self.log.exception(msg)
@@ -623,7 +624,7 @@ class Amazon(Source):
 
         if found:
             try:
-                root = soupparser.fromstring(clean_ascii_chars(raw))
+                root = fromstring(clean_ascii_chars(raw))
             except:
                 msg = 'Failed to parse amazon page for query: %r'%query
                 log.exception(msg)
diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index 2e63a2e267..1164567ff5 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -14,13 +14,13 @@ from threading import RLock
 from Queue import Queue, Empty
 
 from lxml import html
-from lxml.html import soupparser
 
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source, Option
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.library.comments import sanitize_comments_html
+from calibre.utils.soupparser import fromstring
 
 ovrdrv_data_cache = {}
 cache_lock = RLock()
@@ -403,7 +403,7 @@ class OverDrive(Source):
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]
         try:
-            root = soupparser.fromstring(raw)
+            root = fromstring(raw)
         except:
             return False
 
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index 4e3430b1dc..5d12018121 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -353,14 +353,14 @@ class MobiReader(object):
             self.processed_html = self.remove_random_bytes(self.processed_html)
             root = html.fromstring(self.processed_html)
         if root.xpath('descendant::p/descendant::p'):
-            from lxml.html import soupparser
+            from calibre.utils.soupparser import fromstring
             self.log.warning('Malformed markup, parsing using BeautifulSoup')
             try:
-                root = soupparser.fromstring(self.processed_html)
+                root = fromstring(self.processed_html)
             except Exception:
                 self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                 self.processed_html = self.remove_random_bytes(self.processed_html)
-                root = soupparser.fromstring(self.processed_html)
+                root = fromstring(self.processed_html)
 
         if root.tag != 'html':
             self.log.warn('File does not have opening <html> tag')
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 57720e22f2..0daf0d4e7a 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -894,8 +894,8 @@ class Manifest(object):
             except etree.XMLSyntaxError as err:
                 self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
                 if err.args and err.args[0].startswith('Excessive depth'):
-                    from lxml.html import soupparser
-                    data = soupparser.fromstring(data)
+                    from calibre.utils.soupparser import fromstring
+                    data = fromstring(data)
                 else:
                     data = html.fromstring(data)
                 data.attrib.pop('xmlns', None)
diff --git a/src/calibre/gui2/comments_editor.py b/src/calibre/gui2/comments_editor.py
index a594af739e..58ff55e95c 100644
--- a/src/calibre/gui2/comments_editor.py
+++ b/src/calibre/gui2/comments_editor.py
@@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
 import re, os
 
 from lxml import html
-from lxml.html import soupparser
 
 from PyQt4.Qt import QApplication, QFontInfo, QSize, QWidget, QPlainTextEdit, \
     QToolBar, QVBoxLayout, QAction, QIcon, Qt, QTabWidget, QUrl, \
@@ -19,6 +18,7 @@ from PyQt4.QtWebKit import QWebView, QWebPage
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre import xml_replace_entities
 from calibre.gui2 import open_url
+from calibre.utils.soupparser import fromstring
 
 class PageAction(QAction): # {{{
@@ -227,7 +227,7 @@ class EditorWidget(QWebView): # {{{
         try:
             root = html.fromstring(raw)
         except:
-            root = soupparser.fromstring(raw)
+            root = fromstring(raw)
 
         elems = []
         for body in root.xpath('//body'):
diff --git a/src/calibre/utils/soupparser.py b/src/calibre/utils/soupparser.py
new file mode 100644
index 0000000000..403f57baad
--- /dev/null
+++ b/src/calibre/utils/soupparser.py
@@ -0,0 +1,126 @@
+__doc__ = """External interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["fromstring", "parse", "convert_tree"]
+
+from lxml import etree, html
+from calibre.ebooks.BeautifulSoup import \
+    BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
+
+
+def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
+    """Parse a string of HTML data into an Element tree using the
+    BeautifulSoup parser.
+
+    Returns the root ``<html>`` Element of the tree.
+
+    You can pass a different BeautifulSoup parser through the
+    `beautifulsoup` keyword, and a different Element factory function
+    through the `makeelement` keyword.  By default, the standard
+    ``BeautifulSoup`` class and the default factory of `lxml.html` are
+    used.
+    """
+    return _parse(data, beautifulsoup, makeelement, **bsargs)
+
+def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
+    """Parse a file into an ElementTree using the BeautifulSoup parser.
+
+    You can pass a different BeautifulSoup parser through the
+    `beautifulsoup` keyword, and a different Element factory function
+    through the `makeelement` keyword.  By default, the standard
+    ``BeautifulSoup`` class and the default factory of `lxml.html` are
+    used.
+    """
+    if not hasattr(file, 'read'):
+        file = open(file)
+    root = _parse(file, beautifulsoup, makeelement, **bsargs)
+    return etree.ElementTree(root)
+
+def convert_tree(beautiful_soup_tree, makeelement=None):
+    """Convert a BeautifulSoup tree to a list of Element trees.
+
+    Returns a list instead of a single root Element to support
+    HTML-like soup with more than one root element.
+
+    You can pass a different Element factory through the `makeelement`
+    keyword.
+    """
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    root = _convert_tree(beautiful_soup_tree, makeelement)
+    children = root.getchildren()
+    for child in children:
+        root.remove(child)
+    return children
+
+
+# helpers
+
+def _parse(source, beautifulsoup, makeelement, **bsargs):
+    if beautifulsoup is None:
+        beautifulsoup = BeautifulSoup
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    if 'convertEntities' not in bsargs:
+        bsargs['convertEntities'] = 'html'
+    tree = beautifulsoup(source, **bsargs)
+    root = _convert_tree(tree, makeelement)
+    # from ET: wrap the document in a html root element, if necessary
+    if len(root) == 1 and root[0].tag == "html":
+        return root[0]
+    root.tag = "html"
+    return root
+
+def _convert_tree(beautiful_soup_tree, makeelement):
+    root = makeelement(beautiful_soup_tree.name,
+                       attrib=dict(beautiful_soup_tree.attrs))
+    _convert_children(root, beautiful_soup_tree, makeelement)
+    return root
+
+def _convert_children(parent, beautiful_soup_tree, makeelement):
+    SubElement = etree.SubElement
+    et_child = None
+    for child in beautiful_soup_tree:
+        if isinstance(child, Tag):
+            et_child = SubElement(parent, child.name, attrib=dict(
+                [(k, unescape(v)) for (k,v) in child.attrs]))
+            _convert_children(et_child, child, makeelement)
+        elif type(child) is NavigableString:
+            _append_text(parent, et_child, unescape(child))
+        else:
+            if isinstance(child, Comment):
+                parent.append(etree.Comment(child))
+            elif isinstance(child, ProcessingInstruction):
+                parent.append(etree.ProcessingInstruction(
+                    *child.split(' ', 1)))
+            else: # CData
+                _append_text(parent, et_child, unescape(child))
+
+def _append_text(parent, element, text):
+    if element is None:
+        parent.text = (parent.text or '') + text
+    else:
+        element.tail = (element.tail or '') + text
+
+
+# copied from ET's ElementSoup
+
+try:
+    from html.entities import name2codepoint # Python 3
+    name2codepoint
+except ImportError:
+    from htmlentitydefs import name2codepoint
+import re
+
+handle_entities = re.compile("&(\w+);").sub
+
+def unescape(string):
+    if not string:
+        return ''
+    # work around oddities in BeautifulSoup's entity handling
+    def unescape_entity(m):
+        try:
+            return unichr(name2codepoint[m.group(1)])
+        except KeyError:
+            return m.group(0) # use as is
+    return handle_entities(unescape_entity, string)
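
A minimal usage sketch of the new calibre.utils.soupparser entry point, mirroring how the amazon.py, overdrive.py, mobi/reader.py, oeb/base.py and comments_editor.py call sites above now invoke it. The sample markup string is illustrative only and is not taken from the patch; behaviour described in the comments follows from the _parse() helper shown above (BeautifulSoup 3 parse, converted back into an lxml tree whose root tag is forced to 'html').

    from lxml import etree
    from calibre.utils.soupparser import fromstring

    # Markup with problems (unclosed tags, loose entities): fromstring() hands it
    # to BeautifulSoup and rebuilds the soup as lxml Elements.
    raw = '<p>First paragraph<p>Second &amp; last'

    root = fromstring(raw)          # per _parse(), the returned element is always <html>
    print(root.tag)                 # 'html'
    print(etree.tostring(root))     # serialised, repaired tree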