From 0906058fb58e24ae7c0f077600de720fa692cf1b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Jan 2011 14:06:23 -0700 Subject: [PATCH 1/7] Various Czech news source by FunThomas. Fixes #8562 (New Czech recipes) --- resources/recipes/abc.recipe | 43 +++++++++++++++++++++++++++ resources/recipes/idnes.recipe | 54 ++++++++++++++++++++++++++++++++++ resources/recipes/root.recipe | 39 ++++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 resources/recipes/abc.recipe create mode 100644 resources/recipes/idnes.recipe create mode 100644 resources/recipes/root.recipe diff --git a/resources/recipes/abc.recipe b/resources/recipes/abc.recipe new file mode 100644 index 0000000000..c4ae0aa308 --- /dev/null +++ b/resources/recipes/abc.recipe @@ -0,0 +1,43 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ABCRecipe(BasicNewsRecipe): + title = u'ABC Linuxu' + oldest_article = 5 + max_articles_per_feed = 3#5 + __author__ = 'Funthomas' + language = 'cs' + + feeds = [ + #(u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'), + (u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'), + (u'Zprávičky','http://www.abclinuxu.cz/auto/zpravicky.rss') + ] + + remove_javascript = True + no_stylesheets = True + remove_attributes = ['width','height'] + + remove_tags_before = dict(name='h1') + remove_tags = [ + dict(attrs={'class':['meta-vypis','page_tools','cl_perex']}), + dict(attrs={'class':['cl_nadpis-link','komix-nav']}) + ] + + remove_tags_after = [ + dict(name='div',attrs={'class':['cl_perex','komix-nav']}), + dict(attrs={'class':['meta-vypis','page_tools']}), + dict(name='',attrs={'':''}), + ] + + + preprocess_regexps = [ + (re.compile(r'.*

', re.DOTALL),lambda match: '

') + ] + def print_version(self, url): + return url + '?varianta=print&noDiz' + + extra_css = ''' + h1 {font-size:130%; font-weight:bold} + h3 {font-size:111%; font-weight:bold} + ''' diff --git a/resources/recipes/idnes.recipe b/resources/recipes/idnes.recipe new file mode 100644 index 0000000000..0bd4de2327 --- /dev/null +++ b/resources/recipes/idnes.recipe @@ -0,0 +1,54 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +class iHeuteRecipe(BasicNewsRecipe): + __author__ = 'FunThomas' + title = u'iDnes.cz' + publisher = u'MAFRA a.s.' + description = 'iDNES.cz Zprávy, Technet, Komiksy a další' + oldest_article = 3 + max_articles_per_feed = 2 + + feeds = [ + (u'Zprávy', u'http://servis.idnes.cz/rss.asp?c=zpravodaj'), + (u'Sport', u'http://servis.idnes.cz/rss.asp?c=sport'), + (u'Technet', u'http://servis.idnes.cz/rss.asp?c=technet'), + (u'Mobil', u'http://servis.idnes.cz/rss.asp?c=mobil'), + (u'Ekonomika', u'http://servis.idnes.cz/rss.asp?c=ekonomikah'), + #(u'Kultura', u'http://servis.idnes.cz/rss.asp?c=kultura'), + (u'Cestování', u'http://servis.idnes.cz/rss.asp?c=iglobe'), + #(u'Kavárna', u'http://servis.idnes.cz/rss.asp?r=kavarna'), + (u'Komixy', u'http://servis.idnes.cz/rss.asp?c=komiksy') + ] + + + encoding = 'cp1250' + language = 'cs' + cover_url = 'http://g.idnes.cz/u/loga-n4/idnes.gif' + remove_javascript = True + no_stylesheets = True + + remove_attributes = ['width','height'] + remove_tags = [dict(name='div', attrs={'id':['zooming']}), + dict(name='div', attrs={'class':['related','mapa-wrapper']}), + dict(name='table', attrs={'id':['opener-img','portal']}), + dict(name='table', attrs={'class':['video-16ku9']})] + remove_tags_after = [dict(name='div',attrs={'id':['related','related2']})] + + keep_only_tags = [dict(name='div', attrs={'class':['art-full adwords-text','dil-day']}) + ,dict(name='table',attrs={'class':['kemel-box']})] + + def print_version(self, url): + print_url = url + split_url = url.split("?") + if 
(split_url[0].rfind('dilbert.asp') != -1): #dilbert komix + print_url = print_url.replace('.htm','.gif&tisk=1') + print_url = print_url.replace('.asp','.aspx') + elif (split_url[0].rfind('kemel.asp') == -1): #not Kemel komix + print_url = 'http://zpravy.idnes.cz/tiskni.asp?' + split_url[1] + #kemel kemel print page doesn't work + return print_url + + extra_css = ''' + h1 {font-size:125%; font-weight:bold} + h3 {font-size:110%; font-weight:bold} + ''' diff --git a/resources/recipes/root.recipe b/resources/recipes/root.recipe new file mode 100644 index 0000000000..da065829a7 --- /dev/null +++ b/resources/recipes/root.recipe @@ -0,0 +1,39 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1289939440(BasicNewsRecipe): + __author__ = 'FunThomas' + title = u'Root.cz' + description = u'Zprávičky a články z Root.cz' + publisher = u'Internet Info, s.r.o' + oldest_article = 2 #max stari clanku ve dnech + max_articles_per_feed = 50 #max pocet clanku na feed + + feeds = [ + (u'Články', u'http://www.root.cz/rss/clanky/'), + (u'Zprávičky', u'http://www.root.cz/rss/zpravicky/') + ] + + publication_type = u'magazine' + language = u'cs' + no_stylesheets = True + remove_javascript = True + cover_url = u'http://i.iinfo.cz/urs/logo-root-bila-oranzova-cerna-111089527143118.gif' + + remove_attributes = ['width','height','href'] #,'href' + keep_only_tags = [ + dict(name='h1'), + dict(name='a',attrs={'class':'author'}), + dict(name='p', attrs={'class':'intro'}), + dict(name='div',attrs={'class':'urs'}) + ] + + preprocess_regexps = [ + (re.compile(u'

[^<]*]*>', re.DOTALL),lambda match: '

'), + (re.compile(u'

Tričko tučňák.*', re.DOTALL),lambda match: '') + ] + + extra_css = ''' + h1 {font-size:130%; font-weight:bold} + h3 {font-size:111%; font-weight:bold} + ''' From fdedd9803d2f077e111f9777b601ac1f11a60ad2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Jan 2011 14:32:56 -0700 Subject: [PATCH 2/7] 20 Minutos and La Tribuna de Talavera by Luis Hernandez --- resources/recipes/20_minutos.recipe | 17 +++++++++++++++++ resources/recipes/la_tribuna.recipe | 29 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 resources/recipes/20_minutos.recipe create mode 100644 resources/recipes/la_tribuna.recipe diff --git a/resources/recipes/20_minutos.recipe b/resources/recipes/20_minutos.recipe new file mode 100644 index 0000000000..8205c918f5 --- /dev/null +++ b/resources/recipes/20_minutos.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1295310874(BasicNewsRecipe): + title = u'20 Minutos (Boletin)' + __author__ = 'Luis Hernandez' + description = 'Periódico gratuito en español' + cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif' + language = 'es' + + oldest_article = 2 + max_articles_per_feed = 50 + + feeds = [(u'VESPERTINO', u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss') + , (u'DEPORTES', u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss') + , (u'CULTURA', u'http://www.20minutos.es/rss/ocio/') + , (u'TV', u'http://20minutos.feedsportal.com/c/32489/f/490877/index.rss') +] diff --git a/resources/recipes/la_tribuna.recipe b/resources/recipes/la_tribuna.recipe new file mode 100644 index 0000000000..11bdda8f3e --- /dev/null +++ b/resources/recipes/la_tribuna.recipe @@ -0,0 +1,29 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1294946868(BasicNewsRecipe): + title = u'La Tribuna de Talavera' + __author__ = 'Luis Hernández' + description = 'Diario de Talavera de la Reina' + 
cover_url = 'http://www.latribunadetalavera.es/entorno/mancheta.gif' + + oldest_article = 5 + max_articles_per_feed = 50 + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + + encoding = 'utf-8' + language = 'es' + timefmt = '[%a, %d %b, %Y]' + + keep_only_tags = [dict(name='div', attrs={'id':['articulo']}) + ,dict(name='div', attrs={'class':['foto']}) + ,dict(name='p', attrs={'id':['texto']}) + ] + + remove_tags_before = dict(name='div' , attrs={'class':['comparte']}) + remove_tags_after = dict(name='div' , attrs={'id':['relacionadas']}) + + + feeds = [(u'Portada', u'http://www.latribunadetalavera.es/rss.html')] From 896133c7d40f39b05b1306f147f814a681908124 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Jan 2011 15:38:27 -0700 Subject: [PATCH 3/7] Sinfest by nadid --- resources/recipes/sinfest.recipe | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 resources/recipes/sinfest.recipe diff --git a/resources/recipes/sinfest.recipe b/resources/recipes/sinfest.recipe new file mode 100644 index 0000000000..bb0ef2e22e --- /dev/null +++ b/resources/recipes/sinfest.recipe @@ -0,0 +1,33 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Nadid ' +''' +http://www.sinfest.net +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class SinfestBig(BasicNewsRecipe): + title = 'Sinfest' + __author__ = 'nadid' + description = 'Sinfest' + reverse_article_order = False + oldest_article = 5 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = True + encoding = 'utf-8' + publisher = 'Tatsuya Ishida/Museworks' + category = 'comic' + language = 'en' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + feeds = [(u'SinFest', u'http://henrik.nyh.se/scrapers/sinfest.rss' )] + def get_article_url(self, article): + return article.get('link') + From b408af075d5dfd9a0e5edd47cae9250b59466970 Mon Sep 17 
00:00:00 2001 From: Ben Collier Date: Mon, 24 Jan 2011 18:14:44 -0500 Subject: [PATCH 4/7] Updates to New York Times recipe for downloading high resolution images rather than thumbnails and cleaning up readability --- resources/recipes/nytimes_sub.recipe | 120 +++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 17 deletions(-) diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 2424113e31..863e4b22ba 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -1,6 +1,5 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' @@ -28,6 +27,10 @@ class NYTimes(BasicNewsRecipe): # previous paid versions of the new york times to best sent to the back issues folder on the kindle replaceKindleVersion = False + # download higher resolution images than the small thumbnails typically included in the article + # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper + useHighResImages = True + # includeSections: List of sections to include. If empty, all sections found will be included. # Otherwise, only the sections named will be included. 
For example, # @@ -90,7 +93,6 @@ class NYTimes(BasicNewsRecipe): (u'Sunday Magazine',u'magazine'), (u'Week in Review',u'weekinreview')] - if headlinesOnly: title='New York Times Headlines' description = 'Headlines from the New York Times' @@ -127,7 +129,7 @@ class NYTimes(BasicNewsRecipe): earliest_date = date.today() - timedelta(days=oldest_article) - __author__ = 'GRiker/Kovid Goyal/Nick Redding' + __author__ = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier' language = 'en' requires_version = (0, 7, 5) @@ -149,7 +151,7 @@ class NYTimes(BasicNewsRecipe): 'dottedLine', 'entry-meta', 'entry-response module', - 'icon enlargeThis', + #'icon enlargeThis', #removed to provide option for high res images 'leftNavTabs', 'metaFootnote', 'module box nav', @@ -163,7 +165,23 @@ class NYTimes(BasicNewsRecipe): 'entry-tags', #added for DealBook 'footer promos clearfix', #added for DealBook 'footer links clearfix', #added for DealBook - 'inlineImage module', #added for DealBook + 'tabsContainer', #added for other blog downloads + 'column lastColumn', #added for other blog downloads + 'pageHeaderWithLabel', #added for other gadgetwise downloads + 'column two', #added for other blog downloads + 'column two last', #added for other blog downloads + 'column three', #added for other blog downloads + 'column three last', #added for other blog downloads + 'column four',#added for other blog downloads + 'column four last',#added for other blog downloads + 'column last', #added for other blog downloads + 'timestamp published', #added for other blog downloads + 'entry entry-related', + 'subNavigation tabContent active', #caucus blog navigation + 'columnGroup doubleRule', + 'mediaOverlay slideshow', + 'headlinesOnly multiline flush', + 'wideThumb', re.compile('^subNavigation'), re.compile('^leaderboard'), re.compile('^module'), @@ -254,7 +272,7 @@ class NYTimes(BasicNewsRecipe): def exclude_url(self,url): if not url.startswith("http"): return True - if not url.endswith(".html") and 
'dealbook.nytimes.com' not in url: #added for DealBook + if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook return True if 'nytimes.com' not in url: return True @@ -480,7 +498,7 @@ class NYTimes(BasicNewsRecipe): for lidiv in div.findAll('li'): if not skipping: self.handle_article(lidiv) - + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] return self.filter_ans(self.ans) @@ -591,20 +609,85 @@ class NYTimes(BasicNewsRecipe): if article_date < self.earliest_date: self.log("Skipping article dated %s" % date_str) return None + + #all articles are from today, no need to print the date on every page + try: + if not self.webEdition: + date_tag = soup.find(True,attrs={'class': ['dateline','date']}) + if date_tag: + date_tag.extract() + except: + self.log("Error removing the published date") - kicker_tag = soup.find(attrs={'class':'kicker'}) - if kicker_tag: # remove Op_Ed author head shots - tagline = self.tag_to_string(kicker_tag) - if tagline=='Op-Ed Columnist': - img_div = soup.find('div','inlineImage module') - if img_div: - img_div.extract() - + if self.useHighResImages: + try: + #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs + enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) + if enlargeThisList: + for popupref in enlargeThisList: + popupreflink = popupref.find('a') + if popupreflink: + reflinkstring = str(popupreflink['href']) + refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('") + refend = reflinkstring.find(".html", refstart) + len(".html") + reflinkstring = reflinkstring[refstart:refend] + + popuppage = self.browser.open(reflinkstring) + popuphtml = popuppage.read() + popuppage.close() + if popuphtml: + st = time.localtime() + year = str(st.tm_year) + month = "%.2d" % st.tm_mon + day = "%.2d" % st.tm_mday + imgstartpos = 
popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4] + popupSoup = BeautifulSoup(popuphtml) + highResTag = popupSoup.find('img', {'src':highResImageLink}) + if highResTag: + try: + newWidth = highResTag['width'] + newHeight = highResTag['height'] + imageTag = popupref.parent.find("img") + except: + self.log("Error: finding width and height of img") + popupref.extract() + if imageTag: + try: + imageTag['src'] = highResImageLink + imageTag['width'] = newWidth + imageTag['height'] = newHeight + except: + self.log("Error setting the src width and height parameters") + except Exception as e: + self.log("Error pulling high resolution images") + + try: + #remove "Related content" bar + runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline']}) + if runAroundsFound: + for runAround in runAroundsFound: + #find all section headers + hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']}) + if hlines: + for hline in hlines: + hline.extract() + except: + self.log("Error removing related content bar") + + + try: + #in case pulling images failed, delete the enlarge this text + enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) + if enlargeThisList: + for popupref in enlargeThisList: + popupref.extract() + except: + self.log("Error removing Enlarge this text") return self.strip_anchors(soup) def postprocess_html(self,soup, True): - try: if self.one_picture_per_article: # Remove all images after first @@ -766,6 +849,8 @@ class NYTimes(BasicNewsRecipe): try: if len(article.text_summary.strip()) == 0: articlebodies = 
soup.findAll('div',attrs={'class':'articleBody'}) + if not articlebodies: #added to account for blog formats + articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats if articlebodies: for articlebody in articlebodies: if articlebody: @@ -774,13 +859,14 @@ class NYTimes(BasicNewsRecipe): refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip() #account for blank paragraphs and short paragraphs by appending them to longer ones if len(refparagraph) > 0: - if len(refparagraph) > 70: #approximately one line of text + if len(refparagraph) > 140: #approximately two lines of text article.summary = article.text_summary = shortparagraph + refparagraph return else: shortparagraph = refparagraph + " " if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): shortparagraph = shortparagraph + "- " + except: self.log("Error creating article descriptions") return From 64ad0d7a00a69d1912876a66d13a1fb4bb810e0b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Jan 2011 18:11:03 -0700 Subject: [PATCH 5/7] ... --- src/calibre/ebooks/metadata/sources/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/calibre/ebooks/metadata/sources/__init__.py diff --git a/src/calibre/ebooks/metadata/sources/__init__.py b/src/calibre/ebooks/metadata/sources/__init__.py new file mode 100644 index 0000000000..68dfb8d2b5 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + + From a843fae9d7a189463a0c8227d48c5952b8c9c99c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Jan 2011 18:23:45 -0700 Subject: [PATCH 6/7] ... 
--- src/calibre/gui2/convert/search_and_replace.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py index ec59268ec8..88446344ec 100644 --- a/src/calibre/gui2/convert/search_and_replace.py +++ b/src/calibre/gui2/convert/search_and_replace.py @@ -42,9 +42,15 @@ class SearchAndReplaceWidget(Widget, Ui_Form): def break_cycles(self): Widget.break_cycles(self) - self.opt_sr1_search.doc_update.disconnect() - self.opt_sr2_search.doc_update.disconnect() - self.opt_sr3_search.doc_update.disconnect() + def d(x): + try: + x.disconnect() + except: + pass + + d(self.opt_sr1_search) + d(self.opt_sr2_search) + d(self.opt_sr3_search) self.opt_sr1_search.break_cycles() self.opt_sr2_search.break_cycles() From 559ff8c59f26a25731350fe16791ead51233ac50 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 24 Jan 2011 19:57:26 -0700 Subject: [PATCH 7/7] Manual updates --- src/calibre/manual/faq.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 5ebe91bc76..7a04e0f642 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -310,7 +310,9 @@ What formats does |app| read metadata from? Where are the book files stored? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When you first run |app|, it will ask you for a folder in which to store your books. Whenever you add a book to |app|, it will copy the book into that folder. Books in the folder are nicely arranged into sub-folders by Author and Title. Metadata about the books is stored in the file ``metadata.db`` (which is a sqlite database). +When you first run |app|, it will ask you for a folder in which to store your books. Whenever you add a book to |app|, it will copy the book into that folder. Books in the folder are nicely arranged into sub-folders by Author and Title. 
Note that the contents of this folder are automatically managed by |app|; **do not** add any files/folders manually to this folder, as they may be automatically deleted. If you want to add a file associated with a particular book, use the top right area of the :guilabel:`Edit metadata` dialog to do so. Then, |app| will automatically put that file into the correct folder and move it around when the title/author changes. + +Metadata about the books is stored in the file ``metadata.db`` at the top level of the library folder. This file is a sqlite database. When backing up your library, make sure you copy the entire folder and all its sub-folders. Why doesn't |app| let me store books in my own directory structure? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~