diff --git a/Changelog.yaml b/Changelog.yaml index a7fc86c98e..9b62adff7b 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,189 @@ # new recipes: # - title: +- version: 0.8.43 + date: 2012-03-16 + + new features: + - title: "Template language: Speedup evaluation of general program mode templates by pre-compiling them to python. If you experience errors with this optimization, you can turn it off via Preferences->Tweaks. Also other miscellaneous optimizations in evaluating templates with composite columns." + + - title: "MOBI Output: Add an option to not convert all images to JPEG when creating MOBI files. For maximum compatibility of the produced MOBI files, do not use this option." + tickets: [954025] + + - title: "Add iPad3 Output Profile" + + bug fixes: + - title: "KF8 Input: Add support for KF8 files with obfuscated embedded fonts" + tickets: [953260] + + - title: "Make the stars in the book list a little larger on windows >= vista" + + - title: "Revised periodical Section layout, for touchscreen devices resolving iBooks problem with tables spanning multiple pages" + + - title: "Read dc:contributor metadata from MOBI files" + + - title: "MOBI Output: Fix a regression that caused the generated thumbnail embedded in calibre produced MOBI files to be a large, low quality image instead of a small, high quality image. You would have been affected by this bug only if you directly used the output from calibre, without exporting it via send to device or save to disk." + tickets: [954254] + + - title: "KF8 Input: Recognize OpenType embedded fonts as well." + tickets: [954728] + + - title: "Fix regression in 0.8.41 that caused file:/// URLs to stop working in the news download system on windows." 
+ tickets: [955581] + + - title: "When setting metadata in MOBI files fix cover not being updated if the mobi file has its first image record as the cover" + + - title: "Fix column coloring rules based on the size column not working" + tickets: [953737] + + improved recipes: + - Microwaves and RF + - idg.se + + new recipes: + - title: SatMagazine + author: kiavash + +- version: 0.8.42 + date: 2012-03-12 + + new features: + - title: "Support for reading Amazon's new KF8 format" + type: major + description: "calibre can now both view and convert MOBI files that contain Amazon's new KF8 (Kindle Fire) format" + + - title: "Add a tweak to Preferences->Tweaks to control the font size used in the book details panel" + tickets: [948357] + + - title: "Allow specifying a list of file types to exclude when automatically adding files from a folder" + tickets: [943025] + + - title: "Show ratings in the book details panel as stars. Also allow the user to change the alignment of the ratings column in the main books list. No longer display the stars in blue, instead their color can be customized via the column coloring rules, like any other column" + + - title: "When setting metadata in EPUB ensure that the tag has its name attribute first. Needed for the Nook." + + - title: "Drivers for Novo 7, LG G2x and Zenithink T-280" + tickets: [941671, 940625, 940527] + + - title: "Update linux binaries to Qt 4.8.0" + + bug fixes: + - title: "Fix some rar files causing crashes on OS X (updated libunrar.dylib in the OS X build)" + tickets: [951185] + + - title: "MOBI Output: Ignore the Table of Contents pointed to by the guide, if it contains no links" + + - title: "ODT Input: Ignore margin declaration in ODT styles if more specific margin-* declarations are present" + tickets: [941134] + + - title: "Conversion pipeline: Fix @import rules in CSS stylesheets that have comments on their first few lines being ignored." 
+ + - title: "EPUB Input: When extracting the contents of epub files on windows, do not error out if one or more of the components in the epub file have filepaths containing characters that are invalid for the windows filesystem, instead, just replace those characters, since those entries are likely to be errors in the zip container anyway." + tickets: [950081] + + - title: "Textile output: Fix issue with blockquotes and sentences getting removed." + + - title: "MOBI Output: When using the prefer author sort conversion option, handle multiple authors better." + tickets: [947146] + + - title: "Fix regression in 0.8.41 that broke direct connection to iDevices in windows" + tickets: [944534] + + - title: "Fix the download bulk metadata completed popup causing a crash if the Esc key is pressed." + tickets: [943056] + + - title: "Fix rating values doubled in CSV/XML catalogs" + tickets: [942790] + + - title: "EPUB Input: Remove non markup documents from the spine automatically, instead of erroring out" + + - title: "When formatting ratings in templates, etc., do not have an unnecessary .0" + + - title: "Calibre portable: Do not allow calibre portable to run if it is placed in a location whose path is too long. Also hide the library location setup in the welcome wizard when running the portable build." + + - title: "Fix regression in 0.8.41 that broke calibre if the TMP or TEMP environment variable is set to the root of a drive." 
+ tickets: [952284] + + - title: "Fix display of ratings type custom fields in the content server" + tickets: [940600] + + + improved recipes: + - La Jornada + - Chicago Tribune + - Mediapart + - rue89 + + new recipes: + - title: Racjonalista + author: Racjonlista + + - title: JAPAA + author: adoucette + + +- version: 0.8.41 + date: 2012-02-24 + + new features: + - title: "Driver for Sony Experia Play 4G" + tickets: [938831] + + - title: "News download system: Allow use of __future__ in recipes, and do not change line numbers of code in the recipe when compiling it" + + - title: "Use the My Documents folder as the default location for the Calibre Library folder on first start in windows" + tickets: [934840] + + - title: "Add a tweak to Preferences->Tweaks to control the order in which categories appear in the Tag Browser" + + - title: "Tag Browser: Add an entry to the right click menu to quickly delete tags" + tickets: [934509] + + - title: "Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it ends up using." + + - title: "Content server: Add favicon to OPDS feeds." + tickets: [934731] + + bug fixes: + - title: "RTF Input: Fix some WMF images embedded in RTF files being distorted on conversion." + tickets: [934167] + + - title: "Fix long standing bug preventing calibre from working on east asian windows installs when the user name in windows has non-ascii characters" + tickets: [937389] + + - title: "Get Books: Fix Baen Webscription and O'Reilly stores. Fix price detection for Google Books" + + - title: "MOBI Output: When the same anchor is present more than once in the input document, use the first occurrence rather than the last one." 
+ tickets: [934031] + + - title: "Use the 'default cover font' tweak when generating default masthead images as well" + tickets: [939256] + + - title: "Fix content server does not correctly display custom field of type 'rating'" + tickets: [938303] + + - title: "Fix welcome wizard does not save send-from email info unless send-to field is filled" + tickets: [937087] + + - title: "When reading metadata from odt files, use initial-creator in preference to creator for setting the author field" + tickets: [934564] + + - title: "Fix conversion erroring out when the input document has very long and thin images" + tickets: [935234] + + improved recipes: + - The Sun + - Various Polish news sources + - Mediapart + + new recipes: + - title: La pausa caffe + author: faber1971 + + - title: Various Polish news sources + author: fenuks + + - version: 0.8.40 date: 2012-02-17 diff --git a/imgsrc/calibreSymbols.spd b/imgsrc/calibreSymbols.spd new file mode 100644 index 0000000000..1ef6f532c5 --- /dev/null +++ b/imgsrc/calibreSymbols.spd @@ -0,0 +1,152 @@ +SplineFontDB: 3.0 +FontName: calibreSymbols +FullName: calibre Symbols +FamilyName: calibre Symbols +Weight: Medium +Copyright: Created by Kovid Goyal with FontForge 2.0 (http://fontforge.sf.net) +UComments: "2012-2-27: Created." 
+Version: 001.000 +ItalicAngle: 0 +UnderlinePosition: -100 +UnderlineWidth: 50 +Ascent: 800 +Descent: 200 +LayerCount: 2 +Layer: 0 0 "Back" 1 +Layer: 1 0 "Fore" 0 +NeedsXUIDChange: 1 +XUID: [1021 913 325894820 11538708] +FSType: 0 +OS2Version: 0 +OS2_WeightWidthSlopeOnly: 0 +OS2_UseTypoMetrics: 1 +CreationTime: 1330331997 +ModificationTime: 1330487767 +OS2TypoAscent: 0 +OS2TypoAOffset: 1 +OS2TypoDescent: 0 +OS2TypoDOffset: 1 +OS2TypoLinegap: 90 +OS2WinAscent: 0 +OS2WinAOffset: 1 +OS2WinDescent: 0 +OS2WinDOffset: 1 +HheadAscent: 0 +HheadAOffset: 1 +HheadDescent: 0 +HheadDOffset: 1 +MarkAttachClasses: 1 +DEI: 91125 +Encoding: UnicodeFull +UnicodeInterp: none +NameList: Adobe Glyph List +DisplaySize: -24 +AntiAlias: 1 +FitToEm: 1 +WidthSeparation: 150 +WinInfo: 9600 75 22 +BeginPrivate: 0 +EndPrivate +BeginChars: 1114112 3 + +StartChar: uni2605 +Encoding: 9733 9733 0 +Width: 979 +VWidth: -26 +Flags: W +LayerCount: 2 +Fore +SplineSet +551.923 352.862 m 1 + 749.497 369.592 l 2 + 804.954 374.123 833.379 376.389 834.765 376.389 c 0 + 852.095 376.389 860.761 368.896 860.761 353.907 c 0 + 860.761 347.981 859.028 343.363 855.562 340.052 c 0 + 852.095 336.74 825.578 319.225 776.012 287.506 c 2 + 609.635 180.323 l 1 + 716.22 -88.417 l 2 + 717.606 -91.2051 718.301 -95.3877 718.301 -100.965 c 0 + 718.301 -106.193 716.394 -110.725 712.58 -114.558 c 0 + 708.769 -118.393 704.608 -120.31 700.104 -120.31 c 0 + 695.943 -120.31 691.61 -118.828 687.103 -115.866 c 0 + 682.598 -112.902 658.162 -92.251 613.795 -53.9082 c 2 + 466.134 74.71 l 1 + 320.554 -51.8184 l 2 + 274.802 -91.5547 249.758 -112.902 245.426 -115.866 c 0 + 241.092 -118.828 236.846 -120.31 232.688 -120.31 c 0 + 227.835 -120.31 223.415 -118.306 219.429 -114.297 c 0 + 215.442 -110.289 213.449 -105.844 213.449 -100.965 c 0 + 213.449 -97.8281 223.329 -71.3379 243.087 -21.4932 c 2 + 322.115 180.323 l 1 + 152.618 289.598 l 2 + 104.783 320.271 79.2217 337.176 75.9297 340.313 c 0 + 72.6357 343.45 70.9893 347.981 70.9893 353.907 c 0 
+ 70.9893 369.243 79.8291 376.912 97.5059 376.912 c 0 + 98.8926 376.912 123.155 374.82 170.296 370.638 c 2 + 379.825 352.862 l 1 + 427.14 555.201 l 2 + 439.271 607.834 446.811 636.764 449.757 641.992 c 0 + 452.702 647.221 458.162 649.834 466.134 649.834 c 0 + 474.454 649.834 480 646.96 482.772 641.208 c 0 + 485.545 635.457 493.518 604.173 506.689 547.357 c 2 + 551.923 352.862 l 1 +EndSplineSet +Validated: 524289 +EndChar + +StartChar: zero +Encoding: 48 48 1 +Width: 1303 +VWidth: 2048 +Flags: W +HStem: -43.3789 76.7998<582.097 721.09> 623.341 76.7998<582.097 721.091> +VStem: 403.82 97.4395<148.044 508.66> 802.221 96.959<148.044 508.659> +LayerCount: 2 +Fore +SplineSet +651.5 623.341 m 0 + 601.58 623.341 564.061 598.78 538.939 549.66 c 0 + 513.82 500.541 501.26 426.7 501.26 328.141 c 0 + 501.26 229.9 513.82 156.221 538.939 107.101 c 0 + 564.061 57.9805 601.58 33.4209 651.5 33.4209 c 0 + 701.74 33.4209 739.42 57.9805 764.54 107.101 c 0 + 789.66 156.221 802.221 229.9 802.221 328.141 c 0 + 802.221 426.7 789.66 500.541 764.54 549.66 c 0 + 739.42 598.78 701.74 623.341 651.5 623.341 c 0 +651.5 700.141 m 0 + 731.82 700.141 793.18 668.38 835.58 604.859 c 0 + 877.979 541.341 899.18 449.101 899.18 328.141 c 0 + 899.18 207.5 877.979 115.421 835.58 51.9004 c 0 + 793.18 -11.6201 731.819 -43.3789 651.5 -43.3789 c 0 + 571.18 -43.3789 509.82 -11.6201 467.42 51.9004 c 0 + 425.021 115.421 403.82 207.5 403.82 328.141 c 0 + 403.82 449.101 425.021 541.341 467.42 604.859 c 0 + 509.82 668.38 571.18 700.141 651.5 700.141 c 0 +EndSplineSet +Validated: 1 +EndChar + +StartChar: period +Encoding: 46 46 2 +Width: 516 +VWidth: 2048 +Flags: W +HStem: 53.4004 166.199<203.263 309.297> +VStem: 174.6 163.801<82.9501 190.955> +LayerCount: 2 +Fore +SplineSet +338.4 142.8 m 0 + 338.4 119.2 330.5 98.4004 314.7 80.4004 c 0 + 298.9 62.4004 277 53.4004 249 53.4004 c 0 + 225.4 53.4004 207.1 61.2002 194.1 76.7998 c 0 + 181.1 92.4004 174.6 111 174.6 132.6 c 0 + 174.6 155.8 182.6 176.1 198.6 193.5 c 0 + 214.6 
210.9 236.8 219.6 265.2 219.6 c 0 + 288.8 219.6 306.9 212.2 319.5 197.4 c 0 + 332.1 182.6 338.4 164.4 338.4 142.8 c 0 +EndSplineSet +Validated: 1 +EndChar +EndChars +EndSplineFont diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index 3c93d3644f..e121ba4d42 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe): language = 'pl' cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' oldest_article = 7 + needs_subscription='optional' max_articles_per_feed = 100 auto_cleanup = True remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})] @@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe): feeds = BasicNewsRecipe.parse_feeds(self) for feed in feeds: for article in feed.articles[:]: - if 'subskrypcja' in article.title: + if self.username is None and 'subskrypcja' in article.title: feed.articles.remove(article) return feeds + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://archeowiesci.pl/wp-login.php') + br.select_form(name='loginform') + br['log'] = self.username + br['pwd'] = self.password + br.submit() + return br \ No newline at end of file diff --git a/recipes/astronomia_pl.recipe b/recipes/astronomia_pl.recipe index a142520ec5..89a0e4c889 100644 --- a/recipes/astronomia_pl.recipe +++ b/recipes/astronomia_pl.recipe @@ -1,15 +1,18 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Astronomia_pl(BasicNewsRecipe): title = u'Astronomia.pl' __author__ = 'fenuks' description = 'Astronomia - polish astronomy site' + masthead_url = 'http://www.astronomia.pl/grafika/logo.gif' cover_url = 'http://www.astronomia.pl/grafika/logo.gif' category = 'astronomy, science' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - #no_stylesheets=True + extra_css='#h2 {font-size: 18px;}' + 
no_stylesheets=True + preprocess_regexps = [(re.compile(ur'Przeczytaj także:.*?', re.DOTALL), lambda match: '') ] remove_tags_before=dict(name='div', attrs={'id':'a1'}) keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})] feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index d5b4997aa7..cc74cc9128 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' + masthead_url = 'http://www.benchmark.pl/i/logo-footer.png' cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets=True - preprocess_regexps = [(re.compile(ur'\bWięcej o .*', re.DOTALL|re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/biolog_pl.recipe b/recipes/biolog_pl.recipe index af9ad77e44..b10bf0d925 100644 --- a/recipes/biolog_pl.recipe +++ b/recipes/biolog_pl.recipe @@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe): description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.' 
category = 'biology' language = 'pl' + masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png' cover_url='http://www.biolog.pl/naukowy,portal,biolog.png' no_stylesheets = True #keeps_only_tags=[dict(id='main')] remove_tags_before=dict(id='main') remove_tags_after=dict(name='a', attrs={'name':'komentarze'}) - remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})] + remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})] feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')] diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index b4cf6b326c..ff46774dc9 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -1,16 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe - class CD_Action(BasicNewsRecipe): title = u'CD-Action' __author__ = 'fenuks' - description = 'cdaction.pl - polish magazine about games site' + description = 'cdaction.pl - polish games magazine site' category = 'games' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG' keep_only_tags= dict(id='news_content') remove_tags_after= dict(name='div', attrs={'class':'tresc'}) feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')] + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.cdaction.pl/magazyn/') + self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/cgm_pl.recipe 
b/recipes/cgm_pl.recipe index 591155ff85..673a9f940b 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -5,6 +5,7 @@ class CGM(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'Codzienna Gazeta Muzyczna' + masthead_url='http://www.cgm.pl/img/header/logo.gif' cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg' category = 'music' language = 'pl' @@ -23,21 +24,19 @@ class CGM(BasicNewsRecipe): def preprocess_html(self, soup): + gallery=soup.find('div', attrs={'class':'galleryFlash'}) + if gallery: + img=gallery.div + gallery.img.extract() + if img: + img=img['style'] + img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')] + gallery.contents[1].name='img' + gallery.contents[1]['src']=img for item in soup.findAll(style=True): del item['style'] ad=soup.findAll('a') for r in ad: - if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']: + if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']: r.extract() - gallery=soup.find('div', attrs={'class':'galleryFlash'}) - if gallery: - img=gallery.find('embed') - if img: - img=img['src'][35:] - img='http://www.cgm.pl/_vault/_gallery/_photo/'+img - param=gallery.findAll(name='param') - for i in param: - i.extract() - gallery.contents[1].name='img' - gallery.contents[1]['src']=img return soup \ No newline at end of file diff --git a/recipes/chicago_tribune.recipe b/recipes/chicago_tribune.recipe index a5ec8f0743..684993e251 100644 --- a/recipes/chicago_tribune.recipe +++ b/recipes/chicago_tribune.recipe @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class ChicagoTribune(BasicNewsRecipe): @@ -77,10 +78,17 @@ class ChicagoTribune(BasicNewsRecipe): def get_article_url(self, article): - url = article.get('feedburner_origlink', article.get('guid', article.get('link'))) - if url.endswith('?track=rss'): - url 
= url.partition('?')[0] - return url + ans = None + try: + s = article.summary + ans = urllib.unquote( + re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) + except: + pass + if ans is None: + ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) + if ans is not None: + return ans.replace('?track=rss', '') def skip_ad_pages(self, soup): text = soup.find(text='click here to continue to article') diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 6f41b95763..50b626fcbf 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + requires_version = (0, 8, 39) + + def preprocess_raw_html(self, raw, url): + try: + from html5lib import parse + root = parse(raw, namespaceHTMLElements=False, + treebuilder='lxml').getroot() + from lxml import etree + for tag in root.xpath( + '//script|//style|//noscript|//meta|//link|//object'): + tag.getparent().remove(tag) + for elem in list(root.iterdescendants(tag=etree.Comment)): + elem.getparent().remove(elem) + ans = etree.tostring(root, encoding=unicode) + ans = re.sub('.*', lambda match : ''), - (r'
.*?
', lambda m: ''), - (r'Full HTML version of this story which may include photos, graphics, and related links.*', - lambda match : ''), - ]] extra_css = ''' h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large} .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;} diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe new file mode 100644 index 0000000000..7c5138196d --- /dev/null +++ b/recipes/ciekawostki_historyczne.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Ciekawostki_Historyczne(BasicNewsRecipe): + title = u'Ciekawostki Historyczne' + oldest_article = 7 + __author__ = 'fenuks' + description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.' + category = 'history' + language = 'pl' + masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:

.*?', re.DOTALL), lambda match: '')] + no_stylesheets=True + remove_empty_feeds=True + keep_only_tags=[dict(name='div', attrs={'class':'post'})] + remove_tags=[dict(id='singlepostinfo')] + feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='h7') + if tag: + if tag.br: + pass + elif tag.nextSibling.name=='p': + tag=tag.nextSibling + nexturl = tag.findAll('a') + for nextpage in nexturl: + tag.extract() + nextpage= nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(name='div', attrs={'class':'post'}) + for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}): + r.extract() + for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}): + r.extract() + for r in pagetext.findAll('h1'): + r.extract() + pagetext.find('h6').nextSibling.extract() + pagetext.find('h7').nextSibling.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + + \ No newline at end of file diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 90b7d63c56..2ec457e4de 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe): description = u'Serwis o IT w przemyśle, finansach, 
handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' category = 'IT' language = 'pl' + masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 - keep_only_tags=[dict(name='div', attrs={'id':'s'})] + keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 72f9c966bd..a27a9b0877 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): __licence__ ='GPL v3' category = 'IT' language = 'pl' + masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' encoding = 'utf-8' @@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'
Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...
'), lambda match: '') ] - remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] - keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})] + keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] + remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})] + #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index b5453659ef..6da7e0240d 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe): description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.' category = 'newspaper' language = 'pl' - cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg' + masthead_url= 'http://5.s.dziennik.pl/images/logos.png' + cover_url= 'http://5.s.dziennik.pl/images/logos.png' no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 remove_javascript=True remove_empty_feeds=True - preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')] + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('

>>> CZYTAJ TAKŻE: ".*?"

'), lambda m: '')] keep_only_tags=[dict(id='article')] - remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})] + remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'), (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'), @@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe): (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'), (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')] + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + def append_page(self, soup, appendtag): tag=soup.find('a', attrs={'class':'page_next'}) if tag: @@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) return soup + diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 1c72e5704e..0671deec6c 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -10,7 +10,8 @@ class Filmweb_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True - extra_css = '.hdrBig {font-size:22px;}' + remove_empty_feeds=True + extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 
'reviewContent fontSizeCont description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe new file mode 100644 index 0000000000..f3384263d6 --- /dev/null +++ b/recipes/gameplay_pl.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Gameplay_pl(BasicNewsRecipe): + title = u'Gameplay.pl' + oldest_article = 7 + __author__ = 'fenuks' + description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.' + category = 'games, movies, books, music' + language = 'pl' + masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png' + cover_url= 'http://gameplay.pl/img/gpy_top_logo.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})] + remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})] + feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')] + + def image_url_processor(self, baseurl, url): + if 'http' not in url: + return 'http://gameplay.pl'+ url[2:] + else: + return url diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 0959ff80a3..489caf231f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta Wyborcza' __author__ = 'fenuks' - cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' language = 'pl' description ='news from gazeta.pl' category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX='http://wyborcza.pl' remove_empty_feeds= True oldest_article = 3 @@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe): return url else: return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + + def 
get_cover_url(self): + soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') + cover=soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) + self.cover_url='http://wyborcza.pl' + soup.img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index d9c461dc63..e188e4988c 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe): language = 'pl' oldest_article = 13 INDEX= 'http://www.gry-online.pl/' - cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png' + masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' + cover_url='http://www.gry-online.pl/im/gry-online-logo.png' max_articles_per_feed = 100 no_stylesheets= True - extra_css = 'p.wn1{font-size:22px;}' - remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})] - keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})] - #remove_tags= [dict(name='div', attrs={'class':['news_plat']})] + keep_only_tags=[dict(name='div', attrs={'class':'gc660'})] + remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] def append_page(self, soup, appendtag): - nexturl = soup.find('a', attrs={'class':'num_str_nex'}) - if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None: - appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n') - if nexturl is not None: - if 'strona' in nexturl.div.string: - nexturl= self.INDEX + nexturl['href'] - soup2 = self.index_to_soup(nexturl) - pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 
'wn1', 'wob']}) - for tag in pagetext: - pos = len(appendtag.contents) - appendtag.insert(pos, tag) - self.append_page(soup2, appendtag) + tag = appendtag.find('div', attrs={'class':'n5p'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[1:]: + try: + soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) + except: + soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + pagetext = soup2.find(attrs={'class':'gc660'}) + for r in pagetext.findAll(name='header'): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + r.extract() def preprocess_html(self, soup): diff --git a/recipes/icons/ciekawostki_historyczne.png b/recipes/icons/ciekawostki_historyczne.png new file mode 100644 index 0000000000..fa0e2c0591 Binary files /dev/null and b/recipes/icons/ciekawostki_historyczne.png differ diff --git a/recipes/icons/gameplay_pl.png b/recipes/icons/gameplay_pl.png new file mode 100644 index 0000000000..1b7081f393 Binary files /dev/null and b/recipes/icons/gameplay_pl.png differ diff --git a/recipes/icons/in4_pl.png b/recipes/icons/in4_pl.png new file mode 100644 index 0000000000..b3351629f0 Binary files /dev/null and b/recipes/icons/in4_pl.png differ diff --git a/recipes/icons/informacje_usa.png b/recipes/icons/informacje_usa.png new file mode 100644 index 0000000000..4c30e3bcbc Binary files /dev/null and b/recipes/icons/informacje_usa.png differ diff --git a/recipes/icons/kresy_pl.png b/recipes/icons/kresy_pl.png new file mode 100644 index 0000000000..db8ef4efec Binary files /dev/null and b/recipes/icons/kresy_pl.png differ diff --git a/recipes/icons/mediapart.png b/recipes/icons/mediapart.png new file mode 100644 index 0000000000..ab489d3db7 Binary files /dev/null and b/recipes/icons/mediapart.png differ diff --git a/recipes/icons/oclab_pl.png b/recipes/icons/oclab_pl.png new file mode 
100644 index 0000000000..45ecd2533e Binary files /dev/null and b/recipes/icons/oclab_pl.png differ diff --git a/recipes/icons/overclock_pl.png b/recipes/icons/overclock_pl.png new file mode 100644 index 0000000000..38c0b13bfe Binary files /dev/null and b/recipes/icons/overclock_pl.png differ diff --git a/recipes/icons/palmtop_pl.png b/recipes/icons/palmtop_pl.png new file mode 100644 index 0000000000..d711a41682 Binary files /dev/null and b/recipes/icons/palmtop_pl.png differ diff --git a/recipes/icons/pc_arena.png b/recipes/icons/pc_arena.png new file mode 100644 index 0000000000..10be204b36 Binary files /dev/null and b/recipes/icons/pc_arena.png differ diff --git a/recipes/icons/pc_centre_pl.png b/recipes/icons/pc_centre_pl.png new file mode 100644 index 0000000000..e2fbf1eefb Binary files /dev/null and b/recipes/icons/pc_centre_pl.png differ diff --git a/recipes/icons/pc_foster.png b/recipes/icons/pc_foster.png new file mode 100644 index 0000000000..433970bcc1 Binary files /dev/null and b/recipes/icons/pc_foster.png differ diff --git a/recipes/icons/polska_times.png b/recipes/icons/polska_times.png new file mode 100644 index 0000000000..f233f45518 Binary files /dev/null and b/recipes/icons/polska_times.png differ diff --git a/recipes/icons/pure_pc.png b/recipes/icons/pure_pc.png new file mode 100644 index 0000000000..e5e102eee7 Binary files /dev/null and b/recipes/icons/pure_pc.png differ diff --git a/recipes/icons/racjonalista_pl.png b/recipes/icons/racjonalista_pl.png new file mode 100644 index 0000000000..8f4d3c6c81 Binary files /dev/null and b/recipes/icons/racjonalista_pl.png differ diff --git a/recipes/icons/rue89.png b/recipes/icons/rue89.png new file mode 100644 index 0000000000..55c52bc488 Binary files /dev/null and b/recipes/icons/rue89.png differ diff --git a/recipes/icons/tanuki.png b/recipes/icons/tanuki.png new file mode 100644 index 0000000000..fe46d7e8dc Binary files /dev/null and b/recipes/icons/tanuki.png differ diff --git 
a/recipes/icons/tvn24.png b/recipes/icons/tvn24.png new file mode 100644 index 0000000000..864a6624ac Binary files /dev/null and b/recipes/icons/tvn24.png differ diff --git a/recipes/icons/webhosting_pl.png b/recipes/icons/webhosting_pl.png new file mode 100644 index 0000000000..0e11a3065e Binary files /dev/null and b/recipes/icons/webhosting_pl.png differ diff --git a/recipes/idg_se.recipe b/recipes/idg_se.recipe index e5f0203e09..155c6647d3 100644 --- a/recipes/idg_se.recipe +++ b/recipes/idg_se.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class IDGse(BasicNewsRecipe): title = 'IDG' - __author__ = 'zapt0' + __author__ = 'Stanislav Khromov' language = 'sv' description = 'IDG.se' oldest_article = 1 @@ -15,6 +15,9 @@ class IDGse(BasicNewsRecipe): feeds = [(u'Dagens IDG-nyheter',u'http://feeds.idg.se/idg/ETkj?format=xml')] + def get_article_url(self, article): + return article.get('guid', None) + def print_version(self,url): return url + '?articleRenderMode=print&m=print' diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe new file mode 100644 index 0000000000..16ad622b46 --- /dev/null +++ b/recipes/in4_pl.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class in4(BasicNewsRecipe): + title = u'IN4.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Serwis Informacyjny - Aktualnosci, recenzje' + category = 'IT' + language = 'pl' + #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:.*?

', re.DOTALL), lambda match: '')] + keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})] + remove_tags_after= dict(attrs={'class':'tags'}) + remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})] + feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')] diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe index d182e556a2..40992e4d75 100644 --- a/recipes/instapaper.recipe +++ b/recipes/instapaper.recipe @@ -1,8 +1,9 @@ +#v2 2011-07-25 from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1299694372(BasicNewsRecipe): title = u'Instapaper' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic, Stanislav Khromov' publisher = 'Instapaper.com' category = 'info, custom, Instapaper' oldest_article = 365 @@ -15,6 +16,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): ,dict(name='div', attrs={'id':'text_controls'}) ,dict(name='div', attrs={'id':'editing_controls'}) ,dict(name='div', attrs={'class':'bar bottom'}) + ,dict(name='div', attrs={'id':'controlbar_container'}) + ,dict(name='div', attrs={'id':'footer'}) ] use_embedded_content = False needs_subscription = True diff --git a/recipes/japaa.recipe b/recipes/japaa.recipe new file mode 100644 index 0000000000..93fd3e0cce --- /dev/null +++ b/recipes/japaa.recipe @@ -0,0 +1,99 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1330393641(BasicNewsRecipe): + title = u'JAAPA' + __author__ = 'adoucette' + language = 'en' + oldest_article = 30 + max_articles_per_feed = 100 + auto_cleanup = True + + def get_cover_url(self): + cover_url = None + soup = self.index_to_soup('http://www.jaapa.com') + cover_item = soup.find('img', src=re.compile(r'\w*?cover\w{1,22}\.jpg')) + if cover_item: + cover_url = cover_item['src'] + return cover_url + + feeds = [ + (u'CME Articles', + u'http://feeds.feedburner.com/jaapacmearticles'), + (u'A Day in the Life', + 
u'http://www.jaapa.com/pages/rss.aspx?sectionid=490'), + (u'Ask A Librarian', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=847'), + (u'Case of the Month', + u'http://feeds.feedburner.com/jaapacaseofthemonth'), + (u'Clinical Watch', + u'http://feeds.feedburner.com/jaapaclinicalwatch'), + (u'Commentary', + u'http://feeds.feedburner.com/jaapacommentary'), + (u'Critically Appraised Topic', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=699'), + (u'Dermatology Digest', + u'http://feeds.feedburner.com/jaapadermatologydigest'), + (u'Diagnostic Imaging Review', + u'http://feeds.feedburner.com/jaapadiagnosticimagingreview'), + (u'Editorial', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=759'), + (u'From the Academy', + u'http://feeds.feedburner.com/jaapafromtheacademy'), + (u'Genomics in PA Practice', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=760'), + (u'Humane Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=758'), + (u'Inside the AAPA Policy Manual', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1546'), + (u'Interpreting ECGs', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1624'), + (u'Letters', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=808'), + (u'PA Quandaries', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=496'), + (u'Pharmacology Consult', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1614'), + (u'POEMs', u'http://feeds.feedburner.com/jaapapoems'), + (u'Quick Recertification', + u'http://feeds.feedburner.com/jaapaquickrecertificationseries'), + (u'Sounding Board', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=698'), + (u'The Surgical Patient', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=499'), + (u'Topics in Infectious Diseases', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2495'), + (u"What's New", u'http://feeds.feedburner.com/jaapawhatsnew'), + (u'When the Patient Asks', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=501'), + (u"Women's Health", + 
u'http://www.jaapa.com/pages/rss.aspx?sectionid=2176'), + (u'AAPA Special Article', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1453'), + (u'Case Reports', + u'http://feeds.feedburner.com/jaapacasereports'), + (u'Review Articles', + u'http://feeds.feedburner.com/jaapareviewarticles'), + (u'Surgical Reviews', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=505'), + (u'Brief Report', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2353'), + (u'Research Corner', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=498'), + (u'Research Reports', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1024'), + (u'The Art of Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1289'), + (u'Clinical Practice Guidelines', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2102'), + (u'Complementary and Alternative Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2123'), + (u'Drug Information', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2089'), + (u'Evidence-Based Medicine', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=1288'), + (u'Patient Information', + u'http://www.jaapa.com/pages/rss.aspx?sectionid=2122')] + + def print_version(self, url): + return url.replace('/article/', '/printarticle/') diff --git a/recipes/kresy_pl.recipe b/recipes/kresy_pl.recipe new file mode 100644 index 0000000000..3dfc2c057c --- /dev/null +++ b/recipes/kresy_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Kresy(BasicNewsRecipe): + title = u'Kresy' + __author__ = 'fenuks' + description = u'portal społeczności kresowej' + language = 'pl' + masthead_url= 'http://www.kresy.pl/public/img/logo.png' + cover_url= 'http://www.kresy.pl/public/img/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + keep_only_tags= [dict(id='artykul')] + remove_tags= [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})] + feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')] diff --git 
a/recipes/la_jornada.recipe b/recipes/la_jornada.recipe index 71c526a0a0..74565ab179 100644 --- a/recipes/la_jornada.recipe +++ b/recipes/la_jornada.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic , Rogelio Domínguez ' +__copyright__ = '2010-2012, Darko Miletic , Rogelio Domínguez ' ''' www.jornada.unam.mx ''' @@ -86,6 +86,6 @@ class LaJornada_mx(BasicNewsRecipe): return soup def get_article_url(self, article): - rurl = article.get('link', None) + rurl = article.get('guid', None) return rurl.rpartition('&partner=')[0] diff --git a/recipes/la_pausa_caffe.recipe b/recipes/la_pausa_caffe.recipe new file mode 100644 index 0000000000..1a87d33dcf --- /dev/null +++ b/recipes/la_pausa_caffe.recipe @@ -0,0 +1,17 @@ +__version__ = 'v1.0' +__date__ = '13, February 2011' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1329125921(BasicNewsRecipe): + title = u'La pausa caff\xe8' + __author__ = 'faber1971' + description = 'An Italian satirical blog' + language = 'it' + + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + no_stylesheets = True + feeds = [(u'La pausa caff\xe8', u'http://feeds.feedburner.com/LapausaCaffe')] + diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe index 55b6ea2584..0c14939cd8 100644 --- a/recipes/marketing_magazine.recipe +++ b/recipes/marketing_magazine.recipe @@ -1,4 +1,5 @@ __license__ = 'GPL v3' + from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1327062445(BasicNewsRecipe): @@ -7,10 +8,13 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe): max_articles_per_feed = 100 auto_cleanup = True remove_javascript = True + no_stylesheets = True + remove_tags = [ + dict(name='ul', attrs={'id':'ads0'}) + ] masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg' - feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', 
u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] __author__ = 'faber1971' - description = 'Collection of Italian marketing websites - v1.00 (28, January 2012)' + description = 'Collection of Italian marketing websites - v1.03 (20, February 2012)' language = 'it' - + feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')] diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index 4540879f72..f84fb5bc7e 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,69 +1,45 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' ''' Mediapart ''' -from calibre.ebooks.BeautifulSoup import Tag +__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' + +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): title = 'Mediapart' 
- __author__ = 'Mathieu Godlewski' - description = 'Global news in french from online newspapers' + __author__ = 'Mathieu Godlewski, Louis Gesbert' + description = 'Global news in french from news site Mediapart' oldest_article = 7 language = 'fr' needs_subscription = True - max_articles_per_feed = 50 + + use_embedded_content = False no_stylesheets = True - cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg' + cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' feeds = [ ('Les articles', 'http://www.mediapart.fr/articles/feed'), ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# -# preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in -# [ -# (r'', lambda match : '

'+match.group(1)+'

'), -# (r'[^>]+]*>([^<]*)[^<]*', -# lambda match : ''+match.group(1)+''), -# (r'\'', lambda match: '’'), -# ] -# ] -# -# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}), -# dict(name='div', attrs={'class':'print-links'}), -# dict(name='img', attrs={'src':'entete_article.png'}), -# dict(name='br') ] -# -# def print_version(self, url): -# raw = self.browser.open(url).read() -# soup = BeautifulSoup(raw.decode('utf8', 'replace')) -# div = soup.find('div', {'id':re.compile('node-\d+')}) -# if div is None: -# return None -# article_id = string.replace(div['id'], 'node-', '') -# if article_id is None: -# return None -# return 'http://www.mediapart.fr/print/'+article_id +# -- print-version -# -- Non-print version [dict(name='div', attrs={'class':'advert'})] + conversion_options = { 'smarten_punctuation' : True } - keep_only_tags = [ - dict(name='h1', attrs={'class':'title'}), - dict(name='div', attrs={'class':'page_papier_detail'}), - ] + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ] - def preprocess_html(self,soup): - for title in soup.findAll('div', {'class':'titre'}): - tag = Tag(soup, 'h3') - title.replaceWith(tag) - tag.insert(0,title) - return soup + def print_version(self, url): + raw = self.browser.open(url).read() + soup = BeautifulSoup(raw.decode('utf8', 'replace')) + link = soup.find('a', {'title':'Imprimer'}) + if link is None: + return None + return link['href'] # -- Handle login @@ -77,3 +53,10 @@ class Mediapart(BasicNewsRecipe): br.submit() return br + def preprocess_html(self, soup): + for title in soup.findAll('p', {'class':'titre_page'}): + title.name = 'h3' + for legend in soup.findAll('span', {'class':'legend'}): + legend.insert(0, Tag(soup, 'br', [])) + legend.name = 'small' + return soup diff --git a/recipes/microwave_and_rf.recipe b/recipes/microwave_and_rf.recipe index e3eee9dab1..3cdf6e5acc 100644 --- a/recipes/microwave_and_rf.recipe +++ b/recipes/microwave_and_rf.recipe @@ -15,7 +15,7 @@ import re 
from calibre.web.feeds.news import BasicNewsRecipe from calibre.utils.magick import Image -class Microwave_and_RF(BasicNewsRecipe): +class Microwaves_and_RF(BasicNewsRecipe): Convert_Grayscale = False # Convert images to gray scale or not @@ -25,9 +25,9 @@ class Microwave_and_RF(BasicNewsRecipe): # Add sections that want to be included from the magazine include_sections = [] - title = u'Microwave and RF' - __author__ = 'kiavash' - description = u'Microwave and RF Montly Magazine' + title = u'Microwaves and RF' + __author__ = u'kiavash' + description = u'Microwaves and RF Montly Magazine' publisher = 'Penton Media, Inc.' publication_type = 'magazine' site = 'http://mwrf.com' @@ -96,9 +96,16 @@ class Microwave_and_RF(BasicNewsRecipe): def parse_index(self): - # Fetches the main page of Microwave and RF + # Fetches the main page of Microwaves and RF soup = self.index_to_soup(self.site) + # First page has the ad, Let's find the redirect address. + url = soup.find('span', attrs={'class':'commonCopy'}).find('a').get('href') + if url.startswith('/'): + url = self.site + url + + soup = self.index_to_soup(url) + # Searches the site for Issue ID link then returns the href address # pointing to the latest issue latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href') diff --git a/recipes/naczytniki.recipe b/recipes/naczytniki.recipe index e4769d58bc..2ae6bc391e 100644 --- a/recipes/naczytniki.recipe +++ b/recipes/naczytniki.recipe @@ -1,8 +1,9 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class naczytniki(BasicNewsRecipe): title = u'naczytniki.pl' __author__ = 'fenuks' + masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png' language = 'pl' description ='everything about e-readers' @@ -10,6 +11,7 @@ class naczytniki(BasicNewsRecipe): no_stylesheets=True oldest_article = 7 max_articles_per_feed = 100 + preprocess_regexps = 
[(re.compile(ur'


Zobacz także:

.*?', re.DOTALL), lambda match: '') ] remove_tags_after= dict(name='div', attrs={'class':'sociable'}) keep_only_tags=[dict(name='div', attrs={'class':'post'})] remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})] diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 74534f3346..ec556da5fa 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -1,21 +1,33 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe +import re + class Nowa_Fantastyka(BasicNewsRecipe): title = u'Nowa Fantastyka' oldest_article = 7 __author__ = 'fenuks' + __modified_by__ = 'zaslav' language = 'pl' encoding='latin2' description ='site for fantasy readers' category='fantasy' + masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg' + #extra_css='.tytul {font-size: 20px;}' #not working max_articles_per_feed = 100 INDEX='http://www.fantastyka.pl/' no_stylesheets=True needs_subscription = 'optional' - remove_tags_before=dict(attrs={'class':'belka1-tlo-md'}) + remove_tags_before=dict(attrs={'class':'naglowek2'}) #remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'}) - remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'}) - remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})] + remove_tags_after=dict(name='form', attrs={'name':'form1'}) + remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), 
dict(name='form')] + preprocess_regexps = [ + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + def find_articles(self, url): articles = [] @@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe): return feeds + def get_cover_url(self): - soup = self.index_to_soup('http://www.fantastyka.pl/1.html') - cover=soup.find(name='img', attrs={'class':'okladka'}) - self.cover_url=self.INDEX+ cover['src'] + soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka') + self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href'] return getattr(self, 'cover_url', self.cover_url) def get_browser(self): @@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe): br['pass'] = self.password br.submit() return br + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(font=True): + del item['font'] + for item in soup.findAll(align=True): + del item['align'] + for item in soup.findAll(name='tr'): + item.name='div' + title=soup.find(attrs={'class':'tytul'}) + if title: + title['style']='font-size: 20px; font-weight: bold;' + self.log.warn(soup) + return soup diff --git a/recipes/oclab_pl.recipe b/recipes/oclab_pl.recipe new file mode 100644 index 0000000000..b0df89ba72 --- /dev/null +++ b/recipes/oclab_pl.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OCLab(BasicNewsRecipe): + title = u'OCLab.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.' 
+ category = 'IT' + language = 'pl' + cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118' + no_stylesheets = True + keep_only_tags=[dict(id='main')] + remove_tags_after= dict(attrs={'class':'single-postmetadata'}) + remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})] + feeds = [(u'Wpisy', u'http://oclab.pl/feed/')] + + + def append_page(self, soup, appendtag): + tag=soup.find(attrs={'class':'contentjumpddl'}) + if tag: + nexturl=tag.findAll('option') + for nextpage in nexturl[1:-1]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(attrs={'class':'single-entry'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}): + r.extract() + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe new file mode 100644 index 0000000000..d7f4c8093d --- /dev/null +++ b/recipes/overclock_pl.recipe @@ -0,0 +1,37 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class Overclock_pl(BasicNewsRecipe): + title = u'Overclock.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).' + category = 'IT' + language = 'pl' + masthead_url='http://www.overclock.pl/gfx/logo_m.png' + cover_url='http://www.overclock.pl/gfx/logo_m.png' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'Komentarze do aktualności:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Nawigacja

', re.DOTALL), lambda match: '') ] + keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')] + remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})] + feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')] + + + def append_page(self, soup, appendtag): + tag=soup.find(id='navigation') + if tag: + nexturl=tag.findAll('option') + tag.extract() + for nextpage in nexturl[2:]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(id='content') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(attrs={'alt':'Pierwsza'}) + if rem: + rem.parent.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe new file mode 100644 index 0000000000..ace772e7e7 --- /dev/null +++ b/recipes/palmtop_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class palmtop_pl(BasicNewsRecipe): + title = u'Palmtop.pl' + __author__ = 'fenuks' + description = 'wortal technologii mobilnych' + category = 'mobile' + language = 'pl' + cover_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + masthead_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + + feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')] diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe new file mode 100644 index 0000000000..faefeb25c0 --- /dev/null +++ b/recipes/pc_arena.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Arena(BasicNewsRecipe): + title = u'PCArena' + oldest_article = 18300 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Najnowsze 
informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' + category = 'IT' + language = 'pl' + masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif' + cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif' + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})] + remove_tags=[dict(attrs={'class':'pages'})] + feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pagNum'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[1:]: + nextpage= 'http://pcarena.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(attrs={'class':'artBody'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe new file mode 100644 index 0000000000..68a17888ce --- /dev/null +++ b/recipes/pc_centre_pl.recipe @@ -0,0 +1,41 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Centre(BasicNewsRecipe): + title = u'PC Centre' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.' 
+ category = 'IT' + language = 'pl' + masthead_url= 'http://pccentre.pl/views/images/logo.gif' + cover_url= 'http://pccentre.pl/views/images/logo.gif' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] + feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n§ion=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n§ion=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n§ion=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n§ion=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n§ion=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n§ion=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n§ion=9')] + + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pages'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[:-1]: + nextpage= 'http://pccentre.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(id='content') + rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']}) + for r in rem: + r.extract() + rem=pagetext.findAll(id='comments') + for r in rem: + r.extract() + rem=pagetext.findAll('h1') + for r in rem: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_foster.recipe b/recipes/pc_foster.recipe new file mode 100644 index 0000000000..ab8c2b66b1 --- /dev/null +++ b/recipes/pc_foster.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Foster(BasicNewsRecipe): + title = 
u'PC Foster' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.' + category = 'IT' + language = 'pl' + masthead_url='http://pcfoster.pl/public/images/logo.png' + cover_url= 'http://pcfoster.pl/public/images/logo.png' + no_stylesheets= True + remove_empty_feeds= True + keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] + remove_tags=[dict(name='p', attrs={'class':'right'})] + feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'alt':u'Następna strona'}) + if nexturl: + appendtag.find(attrs={'class':'pager more_top'}).extract() + while nexturl: + nexturl='http://pcfoster.pl' + nexturl.parent['href'] + soup2 = self.index_to_soup(nexturl) + nexturl=soup2.find(attrs={'alt':u'Następna strona'}) + pagetext = soup2.find(attrs={'class':'content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'review_content double'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/polska_times.recipe b/recipes/polska_times.recipe new file mode 100644 index 0000000000..4126576fe2 --- /dev/null +++ b/recipes/polska_times.recipe @@ -0,0 +1,81 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Polska_times(BasicNewsRecipe): + title = u'Polska Times' + __author__ = 'fenuks' + description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.' 
+ category = 'newspaper' + language = 'pl' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17' + oldest_article = 7 + max_articles_per_feed = 100 + remove_emty_feeds= True + no_stylesheets = True + preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

def skip_ad_pages(self, soup):
    """Return the real article when *soup* is an advertisement page.

    soup -- parsed page that was downloaded for an article.

    When the page title contains the word ``Advertisement``, the target of
    the first link on the page is fetched with ``raw=True`` and returned.
    In every other case (no title, ordinary title, no usable link) the
    method returns ``None``.
    """
    title = soup.title
    if title is None:
        # A page without <title> cannot be recognised as an ad page;
        # testing ``in`` against None would raise TypeError.
        return None
    # ``in`` applied to the tag itself compares against its child nodes, so
    # it only matched a title that was *exactly* 'Advertisement'.  Search
    # the title text instead.
    title_text = u''.join(title.findAll(text=True))
    if 'Advertisement' not in title_text:
        return None
    link = soup.find('a')
    href = link.get('href') if link is not None else None
    if not href:
        # Nothing to follow: leave the page to the normal pipeline.
        return None
    return self.index_to_soup(href, raw=True)
pagetext.findAll(name='center'): + if center.h4: + if center.h4.a: + center.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def image_article(self, soup, appendtag): + nexturl=soup.find('a', attrs={'class':'nastepna'}) + urls=[] + while nexturl: + if nexturl not in urls: + urls.append(nexturl) + else: + break + soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href']) + nexturl=soup2.find('a', attrs={'class':'nastepna'}) + if nexturl in urls: + break; + pagetext = soup2.find(id='galeria-material') + pos = len(appendtag.contents) + appendtag.insert(pos, '
from calibre.web.feeds.news import BasicNewsRecipe


class PurePC(BasicNewsRecipe):
    """calibre news recipe for the Polish hardware site purepc.pl.

    Articles come from the single feed in ``feeds``; paginated articles
    are merged into one document by ``append_page``, which
    ``preprocess_html`` runs on every article.
    """

    title = u'PurePC'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg'
    cover_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg'
    no_stylesheets = True
    keep_only_tags = [dict(id='content')]
    remove_tags_after = dict(attrs={'class': 'fivestar-widget'})
    remove_tags = [
        dict(id='navigator'),
        dict(attrs={'class': ['box-tools', 'fivestar-widget', 'PageMenuList']}),
    ]
    feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')]

    def append_page(self, soup, appendtag):
        """Append the follow-up pages of a paginated article to *appendtag*.

        soup      -- parsed first page (not read here; kept for the
                     signature shared with the other recipes).
        appendtag -- tag searched for the ``pager-next`` element and
                     extended with the ``.article`` node of each next page.

        Does nothing when *appendtag* has no ``pager-next`` element.
        """
        nexturl = appendtag.find(attrs={'class': 'pager-next'})
        if not nexturl:
            return
        while nexturl:
            anchor = nexturl.a
            if anchor is None:
                # A pager element without a link cannot be followed.
                break
            soup2 = self.index_to_soup('http://www.purepc.pl' + anchor['href'])
            nexturl = soup2.find(attrs={'class': 'pager-next'})
            pagetext = soup2.find(attrs={'class': 'article'})
            if pagetext is not None:
                # Inserting None would raise inside BeautifulSoup and lose
                # the pages that were already collected.
                appendtag.insert(len(appendtag.contents), pagetext)
        # The appended pages bring their own pager and rating widgets.
        # NOTE(review): indentation is lost in the flattened patch; this
        # cleanup is assumed to run only for multi-page articles -- confirm.
        for r in appendtag.findAll(attrs={'class': ['PageMenuList', 'pager', 'fivestar-widget']}):
            r.extract()

    def preprocess_html(self, soup):
        """Merge all pages of the article into ``soup.body`` and return *soup*."""
        self.append_page(soup, soup.body)
        return soup
print_version(self, url): + return url.replace('/s,', '/t,') + + extra_css = 'h2 {font: serif large} .cytat {text-align: right}' + + remove_attributes = ['target', 'width', 'height'] + + preprocess_regexps = [ + (re.compile(i[0], re.DOTALL), i[1]) for i in + [ (r']*> 

', lambda match: ''), + (r' ', lambda match: ' '), + (r']+>', lambda match: ''), + (r']+>', lambda match: ''), + (r'', lambda match: ''), + (r'[^<]+)', lambda match: '' + match.group('a') + ''), + (r'
(?P[^<]+)
', lambda match: '

' + match.group('t') + '

'), + (r'
', lambda match: ''), + (r'
', lambda match: ''), + (r'
', lambda match: ''), + (r']+>(?P

[^<]+)', lambda match: '' + match.group('p') + ''), + (r']+>(?P[^<]+)', lambda match: match.group('a')), + (r'Orygin[^<]+', lambda match: ''), + (r'Poka[^<]+', lambda match: '')] + ] + diff --git a/recipes/rue89.recipe b/recipes/rue89.recipe index 51cf8f6b98..bd3ef7ea4c 100644 --- a/recipes/rue89.recipe +++ b/recipes/rue89.recipe @@ -1,13 +1,11 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Louis Gesbert ' +__copyright__ = '2010-2012, Louis Gesbert ' ''' Rue89 ''' -__author__ = '2010, Louis Gesbert ' +__author__ = '2010-2012, Louis Gesbert ' -import re -from calibre.ebooks.BeautifulSoup import Tag from calibre.web.feeds.news import BasicNewsRecipe class Rue89(BasicNewsRecipe): @@ -17,37 +15,45 @@ class Rue89(BasicNewsRecipe): title = u'Rue89' language = 'fr' oldest_article = 7 - max_articles_per_feed = 50 + max_articles_per_feed = 12 - feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')] + use_embedded_content = False + + # From http://www.rue89.com/les-flux-rss-de-rue89 + feeds = [ + (u'La Une', u'http://www.rue89.com/feed'), + (u'Rue69', u'http://www.rue89.com/rue69/feed'), + (u'Eco', u'http://www.rue89.com/rue89-eco/feed'), + (u'Planète', u'http://www.rue89.com/rue89-planete/feed'), + (u'Sport', u'http://www.rue89.com/rue89-sport/feed'), + (u'Culture', u'http://www.rue89.com/culture/feed'), + (u'Hi-tech', u'http://www.rue89.com/hi-tech/feed'), + (u'Media', u'http://www.rue89.com/medias/feed'), + (u'Monde', u'http://www.rue89.com/monde/feed'), + (u'Politique', u'http://www.rue89.com/politique/feed'), + (u'Societe', u'http://www.rue89.com/societe/feed'), + ] + + # Follow redirection from feedsportal.com + def get_article_url(self,article): + return self.browser.open_novisit(article.link).geturl() + + def print_version(self, url): + return url + '?imprimer=1' no_stylesheets = True - preprocess_regexps = [ - (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL), - lambda match : '<'+match.group(1)+'h3>'), - (re.compile(r'

', re.IGNORECASE|re.DOTALL), - lambda match : '

'+match.group(1)+'

'), - (re.compile(r']+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL), - lambda match : ''+match.group(1)+''), - (re.compile(r'\''), lambda match: '’'), - ] + conversion_options = { 'smarten_punctuation' : True } - def preprocess_html(self,soup): - body = Tag(soup, 'body') - title = soup.find('h1', {'class':'title'}) - content = soup.find('div', {'class':'content'}) - soup.body.replaceWith(body) - body.insert(0, title) - body.insert(1, content) - return soup + keep_only_tags = [ + dict(name='div', attrs={'id':'article'}), + ] - remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}), - #dict(name='div', attrs={'class':'print-links'}), - #dict(name='img', attrs={'class':'print-logo'}), - dict(name='div', attrs={'class':'content_top'}), - dict(name='div', attrs={'id':'sidebar-left'}), ] + remove_tags_after = [ + dict(name='div', attrs={'id':'plus_loin'}), + ] -# -- print-version has poor quality on this website, better do the conversion ourselves -# def print_version(self, url): -# return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url) + remove_tags = [ + dict(name='div', attrs={'id':'article_tools'}), + dict(name='div', attrs={'id':'plus_loin'}), + ] diff --git a/recipes/satmagazine.recipe b/recipes/satmagazine.recipe new file mode 100644 index 0000000000..3e4b1e1b19 --- /dev/null +++ b/recipes/satmagazine.recipe @@ -0,0 +1,155 @@ +#!/usr/bin/env python +## +## Title: SatMagazine +## +## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +## +## Written: Feb 2012 +## Last Edited: Mar 2012 +## + +# Feb 2012: Initial release + +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' + +''' +satmagazine.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class SatMagazine(BasicNewsRecipe): + + title = u'SatMagazine' + description = u'North American Satellite Markets...' 
+ publisher = 'Satnews Publishers' + publication_type = 'magazine' + INDEX = 'http://www.satmagazine.com/cgi-bin/display_edition.cgi' + __author__ = 'kiavash' + + language = 'en' + asciiize = True + timeout = 120 + simultaneous_downloads = 2 + + # Flattens all the tables to make it compatible with Nook + conversion_options = {'linearize_tables' : True} + + keep_only_tags = [dict(name='span', attrs={'class':'story'})] + + no_stylesheets = True + remove_javascript = True + + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] + + # Specify extra CSS - overrides ALL other CSS (IE. Added last). + extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { font-size: 175%; font-weight: bold; } \ + h2 { font-size: 150%; font-weight: bold; } \ + h3 { font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { font-size: 100%; font-weight: bold; }' + + # Remove the line breaks, href links and float left/right and picture width/height. 
def preprocess_html(self, soup):
    """Simplify the markup of a downloaded article and return *soup*.

    soup -- parsed article page; it is modified in place.

    Three passes are made:
    * every ``<img>`` with a ``src`` whose alt text contains ``_ad_`` or
      ``_snipe_`` is stripped of its attributes and renamed to ``<font>``;
      all other such images get ``style="display:block"``;
    * every ``<b>`` is renamed to ``<h3>``;
    * every ``<a>`` with an ``href`` is renamed to ``<font>`` and loses its
      ``href`` and ``target`` attributes.
    """
    # Attributes removed from an image that is recognised as an ad.
    ad_attributes = ('src', 'alt', 'border', 'hspace', 'vspace', 'align', 'size')

    for figure in soup.findAll('img', attrs={'src': True}):
        # Not every image carries alt text; indexing figure['alt'] raised
        # KeyError for those and aborted processing of the article.
        alt = figure.get('alt') or ''
        if '_ad_' in alt or '_snipe_' in alt:
            for attribute in ad_attributes:
                del figure[attribute]
            figure.name = 'font'
            continue
        # Put the image on a line of its own.
        figure['style'] = 'display:block'

    # Make the titles stand out.
    for title in soup.findAll('b'):
        title.name = 'h3'

    # Drop the links but keep their text.
    for link in soup.findAll('a', attrs={'href': True}):
        link.name = 'font'
        del link['href']
        del link['target']

    return soup
- publisher = 'Süddeutsche Zeitung' + publisher = u'Süddeutsche Zeitung' category = 'news, politics, Germany' no_stylesheets = True oldest_article = 2 diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index d06e32d9af..f4c1efa9b8 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -1,14 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe - +import re class Tablety_pl(BasicNewsRecipe): title = u'Tablety.pl' __author__ = 'fenuks' description = u'tablety.pl - latest tablet news' + masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'}) remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})] diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe new file mode 100644 index 0000000000..666cb8aa77 --- /dev/null +++ b/recipes/tanuki.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class tanuki(BasicNewsRecipe): + title = u'Tanuki' + oldest_article = 7 + __author__ = 'fenuks' + category = 'anime, manga' + language = 'pl' + max_articles_per_feed = 100 + encoding='utf-8' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' + preprocess_regexps = [(re.compile(ur'

def append_page(self, soup, appendtag):
    """Append the follow-up pages of a multi-page text to *appendtag*.

    soup      -- parsed first page (not read here; kept for the signature
                 shared with the other recipes).
    appendtag -- tag searched for the ``nextarrow`` link and extended with
                 the content of every following page.

    From each following page the first ``chaptername``/``copycat`` element
    and then the first remaining ``copycat`` element are moved into
    *appendtag*.  Afterwards all ``nextarrow`` elements are removed.  Does
    nothing when *appendtag* has no ``nextarrow`` element.
    """
    nexturl = appendtag.find(attrs={'class': 'nextarrow'})
    if not nexturl:
        return
    while nexturl:
        soup2 = self.index_to_soup('http://czytelnia.tanuki.pl' + nexturl['href'])
        nexturl = soup2.find(attrs={'class': 'nextarrow'})
        # The two lookups stay sequential: inserting a node moves it out
        # of soup2, so the second search sees only what is left.
        for wanted in (['chaptername', 'copycat'], 'copycat'):
            pagetext = soup2.find(attrs={'class': wanted})
            if pagetext is not None:
                # A page with a single matching element made the second
                # lookup return None, and inserting None raised.
                appendtag.insert(len(appendtag.contents), pagetext)
    # NOTE(review): indentation is lost in the flattened patch; this cleanup
    # is assumed to run only for multi-page texts -- confirm.
    for r in appendtag.findAll(attrs={'class': 'nextarrow'}):
        r.extract()
'Chaz Ralph' - description = 'News from The Sun' +class AdvancedUserRecipe1325006965(BasicNewsRecipe): + + title = u'The Sun UK' + cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png' + + description = 'A Recipe for The Sun tabloid UK - uses feed43' + __author__ = 'Dave Asbury' + # last updated 20/2/12 + language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 100 - language = 'en' + max_articles_per_feed = 15 + remove_empty_feeds = True no_stylesheets = True - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' - encoding= 'iso-8859-1' - remove_javascript = True + + masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' + encoding = 'cp1251' + + encoding = 'cp1252' + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + extra_css = ''' + body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} + ''' + + preprocess_regexps = [ + (re.compile(r'