From 0671f6088dbec2a99f1e20c09f61c0adc377d494 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Nov 2011 08:58:15 +0530 Subject: [PATCH 1/5] Fix Folha de Sao Paolo (subscription version) --- recipes/folhadesaopaulo_sub.recipe | 54 +++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe index 31ffb2db66..32dd347405 100644 --- a/recipes/folhadesaopaulo_sub.recipe +++ b/recipes/folhadesaopaulo_sub.recipe @@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe): __author__ = 'fluzao' description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \ u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]' - INDEX = 'http://www1.folha.uol.com.br/fsp/indices/' + + #found this to be the easiest place to find the index page (13-Nov-2011). + # searching for the "Indice Geral" link + HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/' + masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' + language = 'pt' no_stylesheets = True max_articles_per_feed = 40 remove_javascript = True needs_subscription = True - remove_tags_before = dict(name='b') + + remove_tags_before = dict(name='p') remove_tags = [dict(name='td', attrs={'align':'center'})] remove_attributes = ['height','width'] - masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' - # fixes the problem with the section names section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \ 'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \ 'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \ - 'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'} + 'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \ + 'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \ + 'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'} # this solves the problem with truncated content in Kindle conversion_options = {'linearize_tables' : True} # this bit removes the footer where there are links for Proximo Texto, Texto Anterior, # Indice e Comunicar Erros - preprocess_regexps = [(re.compile(r'

Texto Anterior:.*', - re.DOTALL|re.IGNORECASE), lambda match: r''), - (re.compile(r'

Próximo Texto:.*', + preprocess_regexps = [(re.compile(r'.*Comunicar Erros', re.DOTALL|re.IGNORECASE), lambda match: r'')] def get_browser(self): @@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe): def parse_index(self): - soup = self.index_to_soup(self.INDEX) + #Searching for the index page on the HOMEPAGE + hpsoup = self.index_to_soup(self.HOMEPAGE) + indexref = hpsoup.find('a', href=re.compile('^indices.*')) + self.log('--> tag containing the today s index: ', indexref) + INDEX = indexref['href'] + INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX + self.log('--> INDEX after extracting href and adding prefix: ', INDEX) + # ... and taking the opportunity to get the cover image link + coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href'] + if coverurl: + self.log('--> tag containing the today s cover: ', coverurl) + coverurl = coverurl.replace('htm', 'jpg') + coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl + self.log('--> coverurl after extracting href and adding prefix: ', coverurl) + self.cover_url = coverurl + + #soup = self.index_to_soup(self.INDEX) + soup = self.index_to_soup(INDEX) + feeds = [] articles = [] section_title = "Preambulo" @@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe): self.log('--> new section title: ', section_title) if strpost.startswith(' post: ', post) @@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe): # keeping the front page url minha_capa = feeds[0][1][1]['url'] - # removing the 'Preambulo' section + # removing the first section (now called 'top') del feeds[0] - # creating the url for the cover image - coverurl = feeds[0][1][0]['url'] - coverurl = coverurl.replace('/opiniao/fz', '/images/cp') - coverurl = coverurl.replace('01.htm', '.jpg') - self.cover_url = coverurl - # inserting the cover page as the first article (nicer for kindle users) feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}])) return feeds + + From 69c20527f6399ec189c8e63459255c57cfca81c4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Nov 2011 09:28:34 +0530 Subject: [PATCH 2/5] When parsing for lxml via BeatifulSoup, use the calibre modified copy of BeautifulSoup (more robust). 
Fixes #889890 (Amazon metadata download BeautifulSoup error) --- src/calibre/ebooks/metadata/sources/amazon.py | 7 +- .../ebooks/metadata/sources/overdrive.py | 4 +- src/calibre/ebooks/mobi/reader.py | 6 +- src/calibre/ebooks/oeb/base.py | 4 +- src/calibre/gui2/comments_editor.py | 4 +- src/calibre/utils/soupparser.py | 126 ++++++++++++++++++ 6 files changed, 139 insertions(+), 12 deletions(-) create mode 100644 src/calibre/utils/soupparser.py diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 819cd674fc..52dd109b47 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -12,7 +12,7 @@ from urllib import urlencode from threading import Thread from Queue import Queue, Empty -from lxml.html import soupparser, tostring +from lxml.html import tostring from calibre import as_unicode from calibre.ebooks.metadata import check_isbn @@ -23,6 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.library.comments import sanitize_comments_html from calibre.utils.date import parse_date from calibre.utils.localization import canonicalize_lang +from calibre.utils.soupparser import fromstring class Worker(Thread): # Get details {{{ @@ -199,7 +200,7 @@ class Worker(Thread): # Get details {{{ return try: - root = soupparser.fromstring(clean_ascii_chars(raw)) + root = fromstring(clean_ascii_chars(raw)) except: msg = 'Failed to parse amazon details page: %r'%self.url self.log.exception(msg) @@ -623,7 +624,7 @@ class Amazon(Source): if found: try: - root = soupparser.fromstring(clean_ascii_chars(raw)) + root = fromstring(clean_ascii_chars(raw)) except: msg = 'Failed to parse amazon page for query: %r'%query log.exception(msg) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 2e63a2e267..1164567ff5 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -14,13 +14,13 @@ from threading import RLock from Queue import Queue, Empty from lxml import html -from lxml.html import soupparser from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import Source, Option from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode from calibre.library.comments import sanitize_comments_html +from calibre.utils.soupparser import fromstring ovrdrv_data_cache = {} cache_lock = RLock() @@ -403,7 +403,7 @@ class OverDrive(Source): raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: - root = soupparser.fromstring(raw) + root = fromstring(raw) except: return False diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 4e3430b1dc..5d12018121 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -353,14 +353,14 @@ class MobiReader(object): self.processed_html = self.remove_random_bytes(self.processed_html) root = html.fromstring(self.processed_html) if root.xpath('descendant::p/descendant::p'): - from lxml.html import soupparser + from calibre.utils.soupparser import fromstring self.log.warning('Malformed markup, parsing using BeautifulSoup') try: - root = soupparser.fromstring(self.processed_html) + root = fromstring(self.processed_html) except Exception: self.log.warning('MOBI markup appears to contain random bytes. 
Stripping.') self.processed_html = self.remove_random_bytes(self.processed_html) - root = soupparser.fromstring(self.processed_html) + root = fromstring(self.processed_html) if root.tag != 'html': self.log.warn('File does not have opening tag') diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 57720e22f2..0daf0d4e7a 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -894,8 +894,8 @@ class Manifest(object): except etree.XMLSyntaxError as err: self.oeb.logger.warn('Parsing file %r as HTML' % self.href) if err.args and err.args[0].startswith('Excessive depth'): - from lxml.html import soupparser - data = soupparser.fromstring(data) + from calibre.utils.soupparser import fromstring + data = fromstring(data) else: data = html.fromstring(data) data.attrib.pop('xmlns', None) diff --git a/src/calibre/gui2/comments_editor.py b/src/calibre/gui2/comments_editor.py index a594af739e..58ff55e95c 100644 --- a/src/calibre/gui2/comments_editor.py +++ b/src/calibre/gui2/comments_editor.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' import re, os from lxml import html -from lxml.html import soupparser from PyQt4.Qt import QApplication, QFontInfo, QSize, QWidget, QPlainTextEdit, \ QToolBar, QVBoxLayout, QAction, QIcon, Qt, QTabWidget, QUrl, \ @@ -19,6 +18,7 @@ from PyQt4.QtWebKit import QWebView, QWebPage from calibre.ebooks.chardet import xml_to_unicode from calibre import xml_replace_entities from calibre.gui2 import open_url +from calibre.utils.soupparser import fromstring class PageAction(QAction): # {{{ @@ -227,7 +227,7 @@ class EditorWidget(QWebView): # {{{ try: root = html.fromstring(raw) except: - root = soupparser.fromstring(raw) + root = fromstring(raw) elems = [] for body in root.xpath('//body'): diff --git a/src/calibre/utils/soupparser.py b/src/calibre/utils/soupparser.py new file mode 100644 index 0000000000..403f57baad --- /dev/null +++ b/src/calibre/utils/soupparser.py @@ -0,0 +1,126 @@ +__doc__ = """External interface to the BeautifulSoup HTML parser. +""" + +__all__ = ["fromstring", "parse", "convert_tree"] + +from lxml import etree, html +from calibre.ebooks.BeautifulSoup import \ + BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString + + +def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): + """Parse a string of HTML data into an Element tree using the + BeautifulSoup parser. + + Returns the root ```` Element of the tree. + + You can pass a different BeautifulSoup parser through the + `beautifulsoup` keyword, and a diffent Element factory function + through the `makeelement` keyword. By default, the standard + ``BeautifulSoup`` class and the default factory of `lxml.html` are + used. + """ + return _parse(data, beautifulsoup, makeelement, **bsargs) + +def parse(file, beautifulsoup=None, makeelement=None, **bsargs): + """Parse a file into an ElemenTree using the BeautifulSoup parser. + + You can pass a different BeautifulSoup parser through the + `beautifulsoup` keyword, and a diffent Element factory function + through the `makeelement` keyword. By default, the standard + ``BeautifulSoup`` class and the default factory of `lxml.html` are + used. + """ + if not hasattr(file, 'read'): + file = open(file) + root = _parse(file, beautifulsoup, makeelement, **bsargs) + return etree.ElementTree(root) + +def convert_tree(beautiful_soup_tree, makeelement=None): + """Convert a BeautifulSoup tree to a list of Element trees. 
+ + Returns a list instead of a single root Element to support + HTML-like soup with more than one root element. + + You can pass a different Element factory through the `makeelement` + keyword. + """ + if makeelement is None: + makeelement = html.html_parser.makeelement + root = _convert_tree(beautiful_soup_tree, makeelement) + children = root.getchildren() + for child in children: + root.remove(child) + return children + + +# helpers + +def _parse(source, beautifulsoup, makeelement, **bsargs): + if beautifulsoup is None: + beautifulsoup = BeautifulSoup + if makeelement is None: + makeelement = html.html_parser.makeelement + if 'convertEntities' not in bsargs: + bsargs['convertEntities'] = 'html' + tree = beautifulsoup(source, **bsargs) + root = _convert_tree(tree, makeelement) + # from ET: wrap the document in a html root element, if necessary + if len(root) == 1 and root[0].tag == "html": + return root[0] + root.tag = "html" + return root + +def _convert_tree(beautiful_soup_tree, makeelement): + root = makeelement(beautiful_soup_tree.name, + attrib=dict(beautiful_soup_tree.attrs)) + _convert_children(root, beautiful_soup_tree, makeelement) + return root + +def _convert_children(parent, beautiful_soup_tree, makeelement): + SubElement = etree.SubElement + et_child = None + for child in beautiful_soup_tree: + if isinstance(child, Tag): + et_child = SubElement(parent, child.name, attrib=dict( + [(k, unescape(v)) for (k,v) in child.attrs])) + _convert_children(et_child, child, makeelement) + elif type(child) is NavigableString: + _append_text(parent, et_child, unescape(child)) + else: + if isinstance(child, Comment): + parent.append(etree.Comment(child)) + elif isinstance(child, ProcessingInstruction): + parent.append(etree.ProcessingInstruction( + *child.split(' ', 1))) + else: # CData + _append_text(parent, et_child, unescape(child)) + +def _append_text(parent, element, text): + if element is None: + parent.text = (parent.text or '') + text + else: + element.tail = (element.tail or '') + text + + +# copied from ET's ElementSoup + +try: + from html.entities import name2codepoint # Python 3 + name2codepoint +except ImportError: + from htmlentitydefs import name2codepoint +import re + +handle_entities = re.compile("&(\w+);").sub + +def unescape(string): + if not string: + return '' + # work around oddities in BeautifulSoup's entity handling + def unescape_entity(m): + try: + return unichr(name2codepoint[m.group(1)]) + except KeyError: + return m.group(0) # use as is + return handle_entities(unescape_entity, string) From 4409c8c4131056b17eb9ee4b3ae40c2fe4e2a065 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Nov 2011 09:31:04 +0530 Subject: [PATCH 3/5] Fix #889987 (stopped recognizing LG android phone with Word Player. Used to work.) 
--- src/calibre/devices/android/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index e083b38490..20ee9dde5d 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -106,7 +106,7 @@ class ANDROID(USBMS): 0x61c5 : [0x100, 0x226, 0x9999], 0x61cc : [0x100], 0x61ce : [0x100], - 0x618e : [0x226, 0x9999, 0x100] + 0x618e : [0x226, 0x227, 0x9999, 0x100] }, # Archos From 798ad0002921bca1e8c3d34f0049047d3450195c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Nov 2011 09:47:00 +0530 Subject: [PATCH 4/5] Update cgm.pl, historia.pl and tablety.pl --- recipes/cgm_pl.recipe | 2 +- recipes/historia_pl.recipe | 11 ++++++++++- recipes/tablety_pl.recipe | 5 +++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe index 485cf45245..591155ff85 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe): del item['style'] ad=soup.findAll('a') for r in ad: - if 'http://www.hustla.pl' in r['href']: + if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']: r.extract() gallery=soup.find('div', attrs={'class':'galleryFlash'}) if gallery: diff --git a/recipes/historia_pl.recipe b/recipes/historia_pl.recipe index 26cda733b2..34ca158a96 100644 --- a/recipes/historia_pl.recipe +++ b/recipes/historia_pl.recipe @@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe): category = 'history' language = 'pl' oldest_article = 8 + remove_empty_feeds=True max_articles_per_feed = 100 - feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')] + feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'), + (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'), + (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'), + (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'), + (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'), + (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'), + (u'Rekonstrukcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'), + (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'), + (u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')] diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index af317d1b09..d06e32d9af 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -9,6 +9,7 @@ class Tablety_pl(BasicNewsRecipe): language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - keep_only_tags=[dict(name='header', attrs={'class':'entry-header'}), dict(name='div', attrs={'class':'entry-content clearfix'})] - remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'}), dict(name='span', attrs={'class':'dsq-postid'})] + remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) + remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) + remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')] From 6f420c50e1efd2a1d11e9281bc45219e12ee98b3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Nov 2011 10:07:43 +0530 Subject: [PATCH 5/5] Infra.pl and 
Spider's Web by fenuks --- recipes/icons/infra_pl.png | Bin 0 -> 1521 bytes recipes/infra_pl.recipe | 17 +++++++++++++++++ recipes/spiders_web_pl.png | Bin 0 -> 605 bytes recipes/spiders_web_pl.recipe | 15 +++++++++++++++ 4 files changed, 32 insertions(+) create mode 100644 recipes/icons/infra_pl.png create mode 100644 recipes/infra_pl.recipe create mode 100644 recipes/spiders_web_pl.png create mode 100644 recipes/spiders_web_pl.recipe diff --git a/recipes/icons/infra_pl.png b/recipes/icons/infra_pl.png new file mode 100644 index 0000000000000000000000000000000000000000..5607a7c98360a4f3928a697cc25830a7c642ae80 GIT binary patch literal 1521 zcmZ`(e>Bqz6#s@a<;RMn(NYsVYq8CbjCzY9vy8P@`U>z-YtP-YaML%-CulLGN7z8i8(N+wS@RU@o;(zA8?pQ1UtVa5IdmhB} z5IKJMLpjjS_V#yUnlGEaE^*~Boj(d|!8SpnTTHt%`gVm|EzMtgM39{AS#eKlYe6tp z>2n{?i@%YMIBzho@VNmqwmtqq;HuDQMm3w3&);9F8K7KwN~ty(4*@m8%PyuC8=wt= zp-xCH8IbYxz}99u9-xji(T6>3gZ#SUPJoFb0zEo^XE6c5-_(Nu-a9%N_4qA)5^hqGGv-DtcIjQ;raJ=0!*;Zb0ykXFJxow zP38AY-te&1UVh`}nVO1Qw-HDr;&%Pg?4&YKE>511ZB_*BxTFg{liD*LLU>xZ?`cF* zC=BXb7&oXEI+!tgsUQh=FHO|Yq#6~wrS3pqZ&*xSSKh9jqZoGN3OTw(EhC&%oUW(feJqeJP8)K=TJG0*ni*HY$tGo*N*X;DT<{pHIzPmsdbu8hnoefhFpi=>RD*@A+H z4~bRfNLT6W*Nn=hiLSe$Rov`<7U!myvw3!*!jIqP)XJEgmA7wud!OWAz+1z3%1O1C z*WT=}^TsACOHNJ`nwI#H`h$LcdzL>9JUXdbyl9)bkf1Ow4M@%Y#)zGkR+*ZPjwX)v z59tCn?1G4qva)U49zSl+-t5kp-$~eR5fwMY7sU?o?Ga9fOaj4f;FO%pVhs-;l_x0L zGF>+mbxo)wl9H0a!6E}3^W#)ZV$LGU%+hLE;|lTL&hExPb6%GpeCO)bwl;x4teKnc z?#|0=&$*yP-w;Irb$wY`fi>7ujJZ!^kBLN%H63#J{w|lz7!s{LVf530R4PuN8Bt=q zyxMcRB3G7M`4QkqO55+H9D#tDVI|>E%V(-?#48>SC=^5IjC7;4dkJs6OKXap`t!Q) zVQK)EG3vklQf#a^Lp3Zy(!MBiRFS7ohlhuEs4+^I;)GcNF+46UL`=c^v z*P(uI>*#2HmE5U3K+FdpZV7pi^r^qnWp{eQh(5ST?<_7n4)PX5MRZmuTls_JQ{2He zdBq@`ZE9MB72Q{~3gRd}KGBn92qJNfiA**>m!+9kMQP7*VPU3b@51~MM}c5vd1|FQ z8QPW1Ars&n>G4K0T=nh%OdO?rbw}%r?y=Ci^Wa3q2?pZ?!4CveWu&F0%~iXpmGT-| z7&iUOn^vD$JD!NFHQVAqFmDiy84fg-IU4pE>InxeVE_)L-%BL`Okr3{Tu-^Q%E>}z z(Kp=e?_^1|*-#YXP=hpMRnUvk64py?2lZeaD6g3Ut9up`Lj!{H?GIo z)yV11uahvA_6RmP79KjJ<}`oy=xEv8m1eO{pV)_Dt?nfHtZX^{$o#JdtDZS(f2b|P zCzwHxU_`q|r9^85a73Ze_CLb;AR6V4+U4$uL1-C;!mD$w|6_I(RGT!s zk}o*k3QPaC|I|VDF5YYNuh|M{c}`o1V4tqU#|&35|r< z^YtQ0i}%%L6@+VtSnl9In7{96K+#cN{+izMt*z^*&1;|I%uReW1^GJasqxgavdMwLO`s@h>W_$*RdP`( zkYX@0Ff`N!A(Ie811ke_D+6O~12ZcFgXh9KW}|4x%}>cptHiBgnjeEYPy>UftDnm{ Hr-UW|$dvye literal 0 HcmV?d00001 diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe new file mode 100644 index 0000000000..d615f01aa9 --- /dev/null +++ b/recipes/spiders_web_pl.recipe @@ -0,0 +1,15 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class SpidersWeb(BasicNewsRecipe): + title = u"Spider's Web" + oldest_article = 7 + __author__ = 'fenuks' + description = u'' + cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + category = 'IT, WEB' + language = 'pl' + max_articles_per_feed = 100 + remove_tags_before=dict(name="h1", attrs={'class':'Title'}) + remove_tags_after=dict(name="div", attrs={'class':'Text'}) + remove_tags=[dict(name='div', attrs={'class':['Tags', 'CommentCount FloatL', 'Show FloatL']})] + feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]
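
A quick illustration of how the parser fallback introduced in PATCH 2/5 is intended to be used by callers. The sketch below is not part of the series; the sample markup and variable names are made up, but the try/except shape mirrors the fallback the patch installs in comments_editor.py (and, with extra structural checks, in mobi/reader.py and oeb/base.py): parse with lxml's own HTML parser first, and only fall back to the BeautifulSoup-based calibre.utils.soupparser.fromstring() when lxml cannot cope with the markup.

    from lxml import html
    from calibre.utils.soupparser import fromstring

    raw = '<p>some <b>badly nested<p> markup</b>'  # made-up sample input
    try:
        # Fast path: lxml's native HTML parser
        root = html.fromstring(raw)
    except Exception:
        # Fallback: the helper added by PATCH 2/5, backed by calibre's
        # modified BeautifulSoup copy, which tolerates more broken markup
        root = fromstring(raw)
    print(html.tostring(root))

The recipes touched in PATCH 1/5, 4/5 and 5/5 can be exercised the usual way, e.g. ebook-convert spiders_web_pl.recipe .epub --test (assuming a local calibre install or checkout).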