From 047e12391c0001a3bced40107b4b5d99b7ebb737 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 10 Feb 2012 21:39:22 +0530 Subject: [PATCH 01/29] Fix #930190 (Updated recipe for Pescanik) --- recipes/icons/pescanik.png | Bin 1087 -> 289 bytes recipes/pescanik.recipe | 50 ++++++++++++++----------------------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/recipes/icons/pescanik.png b/recipes/icons/pescanik.png index 7afbcd7ba21347f4ae777e251dca30f1e8b0aaff..8ac7ee50b8ed6360fe5b6b190f1bc14fd25eff4a 100644 GIT binary patch delta 273 zcmV+s0q*|42%!Ry8Gix*005AYXf^-<0O(0XK~#90WBmXBKLaU%k>(nha7p}m`-lV$ zpSN_sZgc+k?ITedz8zojWm^x!_fK#70|{&R{rJkqHMJ1--=|mKFUrHG0qBNzGZTPj z!$p3aSP8Tcrv{+vfA1cn8ToN>KG0e$8a}SB{eAlwR8&Zo!+#|~bK7OUO|)c~{+Xa>;ftg2Q5aT8gD<&d2O3}R5A zF)#q_6pWh$((tO)>F>M8{2|R;mJtlt05B4PRskI>+I$-7B%l+4ZosM<4j^i1pZEZv X;fGD%f6gZW015yANkvXXu0mjfxkY?c literal 1087 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GXl4m>B|mLRdT*IRD&D1$u_F zz$3C4NPB>>+sSM@kYHJXV>*xq=NyKZV`~osIeDHgjv*44OV6L?V^-vF2o!Ow?D+qG zyK-jka^E)D^g`wh|Jdd={1iVi_uT^KwJ&&&RGI%|a+$lCP3FO!;2*3{ETofW&q$1j za(n$OZ*Q+`8_zY3C8}9%iR>Tea^Fdu&`~U0*>WK1;^%fQd;9M*xo14&5uT9q>pIY8 z)e_f;lH{V)#FA9q6d=K1WME*SYiOivY#3r-Ze?n0Wn!pnU}0roz&K&SO_+w<{FKbJ XN&*_<6tBGnYGCkm^>bP0l+XkK8rOI& diff --git a/recipes/pescanik.recipe b/recipes/pescanik.recipe index c07434535f..a60c35b4b8 100644 --- a/recipes/pescanik.recipe +++ b/recipes/pescanik.recipe @@ -1,18 +1,18 @@ +# -*- coding: utf-8 -*- __license__ = 'GPL v3' -__copyright__ = '2008-2010, Darko Miletic ' +__copyright__ = '2008-2012, Darko Miletic ' ''' pescanik.net ''' import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class Pescanik(BasicNewsRecipe): - title = 'Pescanik' + title = 'Peščanik' __author__ = 'Darko Miletic' - description = 'Pescanik' - publisher = 'Pescanik' + description = 'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. 
se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH' + publisher = 'Peščanik' category = 'news, politics, Serbia' oldest_article = 10 max_articles_per_feed = 100 @@ -20,8 +20,13 @@ class Pescanik(BasicNewsRecipe): use_embedded_content = False encoding = 'utf-8' language = 'sr' - publication_type = 'newsportal' - extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,"Lucida Grande",Tahoma,Verdana,sans1,sans-serif} .contentheading{font-size: x-large; font-weight: bold} .small{font-size: small} .createdate{font-size: x-small; font-weight: bold} ' + publication_type = 'newsportal' + masthead_url = 'http://pescanik.net/wp-content/uploads/2011/10/logo1.png' + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Verdana,Arial,Tahoma,sans1,sans-serif} + #BlogTitle{font-size: xx-large; font-weight: bold} + """ conversion_options = { 'comment' : description @@ -32,29 +37,12 @@ class Pescanik(BasicNewsRecipe): preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - remove_attributes = ['valign','colspan','width','height','align','alt'] - - remove_tags = [dict(name=['object','link','meta','script'])] - - keep_only_tags = [ - dict(attrs={'class':['contentheading','small','createdate']}) - ,dict(name='td', attrs={'valign':'top','colspan':'2'}) - ] - - feeds = [(u'Pescanik Online', u'http://www.pescanik.net/index.php?option=com_rd_rss&id=12')] + remove_tags = [dict(name=['object','link','meta','script','iframe','embed'])] + keep_only_tags = [dict(attrs={'id':['BlogTitle','BlogDate','BlogContent']})] + feeds = [ + (u'Autori' , u'http://pescanik.net/category/autori/feed/'), + (u'Prevodi', u'http://pescanik.net/category/prevodi/feed/') + ] def print_version(self, url): - nurl = url.replace('/index.php','/index2.php') - return nurl + '&pop=1&page=0' - - def preprocess_html(self, soup): - st = soup.findAll('td') - for it in st: - it.name='p' - for pt in soup.findAll('img'): - brtag = Tag(soup,'br') - brtag2 = Tag(soup,'br') - pt.append(brtag) - pt.append(brtag2) - return soup + return url + 'print/' \ No newline at end of file From b6dfa46b766ff1d8f64afb5e51c202e8944d23ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 10 Feb 2012 21:51:30 +0530 Subject: [PATCH 02/29] Driver for PocketBook A10 --- src/calibre/devices/android/driver.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 97e494b2dd..69eac39c41 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -101,6 +101,7 @@ class ANDROID(USBMS): 0xc001 : [0x0226], 0xc004 : [0x0226], 0x8801 : [0x0226, 0x0227], + 0xe115 : [0x0216], # PocketBook A10 }, # Acer @@ -165,7 +166,8 @@ class ANDROID(USBMS): 'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS', 'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA', 'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON', - 'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP'] + 'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP', + 'POCKET'] WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE', '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897', 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', @@ -179,13 +181,13 @@ 
class ANDROID(USBMS): 'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI', 'UMS',
 '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107',
 'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855',
- 'XT910']
+ 'XT910', 'BOOK_A10']
 WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
 'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
 '__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL',
 'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853',
- 'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910']
+ 'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD']

 OSX_MAIN_MEM = 'Android Device Main Memory'

From 9f28a510617d978650bd99e4242430e13de95032 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Feb 2012 09:05:26 +0530
Subject: [PATCH 03/29] Microwave and RF magazine by kiavash

---
 recipes/microwave_and_rf.recipe | 217 ++++++++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 recipes/microwave_and_rf.recipe

diff --git a/recipes/microwave_and_rf.recipe b/recipes/microwave_and_rf.recipe
new file mode 100644
index 0000000000..e3eee9dab1
--- /dev/null
+++ b/recipes/microwave_and_rf.recipe
@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+##
+## Title: Microwave and RF
+##
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+
+# Feb 2012: Initial release
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+'''
+mwrf.com
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.magick import Image
+
+class Microwave_and_RF(BasicNewsRecipe):
+
+ Convert_Grayscale = False # Convert images to gray scale or not
+
+ # Add sections that you want excluded from the magazine
+ exclude_sections = []
+
+ # Add sections that you want included in the magazine
+ include_sections = []
+
+ title = u'Microwave and RF'
+ __author__ = 'kiavash'
+ description = u'Microwave and RF Monthly Magazine'
+ publisher = 'Penton Media, Inc.'
+ publication_type = 'magazine'
+ site = 'http://mwrf.com'
+
+ language = 'en'
+ asciiize = True
+ timeout = 120
+ simultaneous_downloads = 1 # very peaky site!
+
+ # Main article is inside this tag
+ keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})]
+
+ no_stylesheets = True
+ remove_javascript = True
+
+ # Flattens all the tables to make it compatible with Nook
+ conversion_options = {'linearize_tables' : True}
+
+ remove_tags = [
+ dict(name='span', attrs={'class':'body12'}),
+ ]
+
+ remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+ 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
+
+ # Specify extra CSS - overrides ALL other CSS (IE. Added last).
+ extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
 .introduction, .first { font-weight: bold; } \
 .cross-head { font-weight: bold; font-size: 125%; } \
 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
 font-size: 80%; font-style: italic; margin: 1px auto; } \
 .story-date, .published { font-size: 80%; } \
 table { width: 100%; } \
 td img { display: block; margin: 5px auto; } \
 ul { padding-top: 10px; } \
 ol { padding-top: 10px; } \
 li { padding-top: 5px; padding-bottom: 5px; } \
 h1 { font-size: 175%; font-weight: bold; } \
 h2 { font-size: 150%; font-weight: bold; } \
 h3 { font-size: 125%; font-weight: bold; } \
 h4, h5, h6 { font-size: 100%; font-weight: bold; }'
+
+ # Remove the line breaks and float left/right and picture width/height.
+ preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+ (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
+ (re.compile(r'float:.*?'), lambda m: ''),
+ (re.compile(r'width:.*?px'), lambda m: ''),
+ (re.compile(r'height:.*?px'), lambda m: '')
+ ]
+
+
+ def print_version(self, url):
+ url = re.sub(r'.html', '', url)
+ url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
+ return url
+
+ # Need to change the user agent to avoid potential download errors
+ def get_browser(self, *args, **kwargs):
+ from calibre import browser
+ kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
+ return browser(*args, **kwargs)
+
+
+ def parse_index(self):
+
+ # Fetches the main page of Microwave and RF
+ soup = self.index_to_soup(self.site)
+
+ # Searches the site for the Issue ID link then returns the href address
+ # pointing to the latest issue
+ latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href')
+
+ # Fetches the index page of the latest issue
+ soup = self.index_to_soup(latest_issue)
+
+ # Finds the main section of the page containing cover, issue date and
+ # TOC
+ ts = soup.find('div', attrs={'id':'columnContainer'})
+
+ # Finds the issue date
+ ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
+ self.log('Found Current Issue:', ds)
+ self.timefmt = ' [%s]'%ds
+
+ # Finds the cover image
+ cover = ts.find('img', src = lambda x: x and 'Cover' in x)
+ if cover is not None:
+ self.cover_url = self.site + cover['src']
+ self.log('Found Cover image:', self.cover_url)
+
+ feeds = []
+ article_info = []
+
+ # Finds all the articles (titles and links)
+ articles = ts.findAll('a', attrs={'class':'commonArticleTitle'})
+
+ # Finds all the descriptions
+ descriptions = ts.findAll('span', attrs={'class':'commonCopy'})
+
+ # Find all the sections
+ sections = ts.findAll('span', attrs={'class':'kicker'})
+
+ title_number = 0
+
+ # Goes through all the articles one by one and sorts them out
+ for section in sections:
+ title_number = title_number + 1
+
+ # Removes the unwanted sections
+ if self.tag_to_string(section) in self.exclude_sections:
+ continue
+
+ # Only includes the wanted sections
+ if self.include_sections:
+ if self.tag_to_string(section) not in self.include_sections:
+ continue
+
+
+ title = self.tag_to_string(articles[title_number])
+ url = 
articles[title_number].get('href')
+ if url.startswith('/'):
+ url = self.site + url
+
+ self.log('\tFound article:', title, 'at', url)
+ desc = self.tag_to_string(descriptions[title_number])
+ self.log('\t\t', desc)
+
+ article_info.append({'title':title, 'url':url, 'description':desc,
+ 'date':self.timefmt})
+
+ if article_info:
+ feeds.append((self.title, article_info))
+
+ #self.log(feeds)
+ return feeds
+
+ def postprocess_html(self, soup, first):
+ if self.Convert_Grayscale:
+ # process all the images
+ for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+ iurl = tag['src']
+ img = Image()
+ img.open(iurl)
+ if img < 0:
+ raise RuntimeError('Out of memory')
+ img.type = "GrayscaleType"
+ img.save(iurl)
+ return soup
+
+ def preprocess_html(self, soup):
+
+ # Includes all the figures inside the final ebook
+ # Finds all the jpg links
+ for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}):
+
+ # makes sure that the link points to the absolute web address
+ if figure['href'].startswith('/'):
+ figure['href'] = self.site + figure['href']
+
+ figure.name = 'img' # converts the links to img
+ figure['src'] = figure['href'] # with the same address as href
+ figure['style'] = 'display:block' # adds a line break before and after the image
+ del figure['href']
+ del figure['target']
+
+ # Makes the title stand out
+ for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}):
+ title.name = 'h1'
+ del title['href']
+ del title['target']
+
+ # Makes the section name more visible
+ for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}):
+ section_name.name = 'h5'
+ del section_name['href']
+ del section_name['target']
+
+ # Removes all unrelated links
+ for link in soup.findAll('a', attrs = {'href': True}):
+ link.name = 'font'
+ del link['href']
+ del link['target']
+
+ return soup

From 25e8de62448ea20612d566eb9e088336e2e1fbfe Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Feb 2012 09:10:59 +0530
Subject: [PATCH 04/29] Update Common Dreams

---
 recipes/common_dreams.recipe | 89 ++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 19 deletions(-)

diff --git a/recipes/common_dreams.recipe b/recipes/common_dreams.recipe
index 5443b5890b..62edfe8684 100644
--- a/recipes/common_dreams.recipe
+++ b/recipes/common_dreams.recipe
@@ -1,38 +1,89 @@
+#!/usr/bin/env python
+##
+## Title: Common Dreams
+##
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+# Feb 2012: Cleaned up the output to have only the main article
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+'''
+commondreams.org
+'''
+
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class CommonDreams(BasicNewsRecipe):
 # Identify the recipe

 title = u'Common Dreams'
- description = u'Progressive news and views'
+ description = u'Breaking News & Views for the Progressive Community.'
+ cover_url = 'https://s3.amazonaws.com/s3.commondreams.org/images/common-dreams.png' __author__ = u'XanthanGum' language = 'en' - # Format the text - - extra_css = ''' - body{font-family:verdana,arial,helvetica,geneva,sans-serif ;} - h1{font-size: xx-large;} - h2{font-size: large;} - ''' - - # Pick no article older than seven days and limit the number of articles per feed to 100 - oldest_article = 7 max_articles_per_feed = 100 - # Remove everything before the article + no_stylesheets = True + remove_javascript = True + + # Flattens all the tables to make it compatible with Nook + conversion_options = {'linearize_tables' : True} - remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'}) + remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] - # Remove everything after the article + # Specify extra CSS - overrides ALL other CSS (IE. Added last). + extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ + .introduction, .first { font-weight: bold; } \ + .cross-head { font-weight: bold; font-size: 125%; } \ + .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ + .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \ + .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ + .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ + font-size: 80%; font-style: italic; margin: 1px auto; } \ + .story-date, .published { font-size: 80%; } \ + table { width: 100%; } \ + td img { display: block; margin: 5px auto; } \ + ul { padding-top: 10px; } \ + ol { padding-top: 10px; } \ + li { padding-top: 5px; padding-bottom: 5px; } \ + h1 { font-size: 175%; font-weight: bold; } \ + h2 { font-size: 150%; font-weight: bold; } \ + h3 { font-size: 125%; font-weight: bold; } \ + h4, h5, h6 { font-size: 100%; font-weight: bold; }' + + # Remove the line breaks and float left/right and picture width/height. 
+ preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+ (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
+ (re.compile(r'float:.*?'), lambda m: ''),
+ (re.compile(r'width:.*?px'), lambda m: ''),
+ (re.compile(r'height:.*?px'), lambda m: ''),
+ (re.compile(r''), lambda m: ''),
+ (re.compile(r''), lambda m: ''),
+ ]

- remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
+ # Main article is inside this tag
+ keep_only_tags = [
+ dict(name='div', attrs={'id':lambda x: x and 'node-' in x}),
+ ]
+
+ remove_tags = [
+ dict(name='div', attrs={'class':'node-links clear-block'}), # remove Share options
+ ]
+
+
 # Identify the news feeds
- feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
- (u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
- (u'Views', u'http://www.commondreams.org/feed/views_rss'),
- (u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
+ feeds = [(u'Headlines', u'https://www.commondreams.org/feed/headlines_rss'),
+ (u'Further News Articles', u'https://www.commondreams.org/feed/further_rss'),
+ (u'Views', u'https://www.commondreams.org/feed/views_rss'),
+ (u'Progressive Newswire', u'https://www.commondreams.org/feed/newswire_rss')]
+
+
+ def print_version(self, url):
+ url = url + '?print'
+ return url
\ No newline at end of file

From 5b10bea77b447495d75ce519357f41e3f7070b06 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Feb 2012 09:15:04 +0530
Subject: [PATCH 05/29] Consortium News by kiavash

---
 recipes/consortium_news.recipe | 71 ++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 recipes/consortium_news.recipe

diff --git a/recipes/consortium_news.recipe b/recipes/consortium_news.recipe
new file mode 100644
index 0000000000..f153be6fc7
--- /dev/null
+++ b/recipes/consortium_news.recipe
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+##
+## Title: Consortium News
+##
+## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+
+# Feb 2012: Initial release
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+'''
+consortiumnews.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ConsortiumNews(BasicNewsRecipe):
+
+ title = u'Consortium News'
+ publisher = 'Copyright © 2012 Consortiumnews. All Rights Reserved.'
+ language = 'en'
+ __author__ = 'kiavash'
+
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ no_stylesheets = True
+ remove_javascript = True
+
+ conversion_options = {'linearize_tables' : True} # Flattens all the tables to make it compatible with Nook
+
+ remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+ 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
+
+ # Specify extra CSS - overrides ALL other CSS (IE. Added last).
+ extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
 .introduction, .first { font-weight: bold; } \
 .cross-head { font-weight: bold; font-size: 125%; } \
 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
 font-size: 80%; font-style: italic; margin: 1px auto; } \
 .story-date, .published { font-size: 80%; } \
 table { width: 100%; } \
 td img { display: block; margin: 5px auto; } \
 ul { padding-top: 10px; } \
 ol { padding-top: 10px; } \
 li { padding-top: 5px; padding-bottom: 5px; } \
 h1 { font-size: 175%; font-weight: bold; } \
 h2 { font-size: 150%; font-weight: bold; } \
 h3 { font-size: 125%; font-weight: bold; } \
 h4, h5, h6 { font-size: 100%; font-weight: bold; }'
+
+ # Remove the line breaks and float left/right and picture width/height.
+ preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+ (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
+ (re.compile(r'float:.*?'), lambda m: ''),
+ (re.compile(r'width:.*?px'), lambda m: ''),
+ (re.compile(r'height:.*?px'), lambda m: ''),
+ (re.compile(r''), lambda h1: ''),
+ (re.compile(r''), lambda h2: ''),
+ ]
+
+ # Main article is inside this tag
+ keep_only_tags = [dict(name='div', attrs={'id':lambda x: x and 'post-' in x})]
+
+ remove_tags = [
+ dict(name='div', attrs={'class':'sociable'}), # remove 'Share this Article'
+ dict(name='p', attrs={'class':'tags'}), # remove 'Tags: ... '
+ ]
+
+ feeds = [(u'Consortium News', u'http://feeds.feedburner.com/Consortiumnewscom')]

From bd27557d5885436322e24a0be1fd10594e585cd7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 11 Feb 2012 10:23:41 +0530
Subject: [PATCH 06/29] Kindle driver: Add an option to allow using page counts stored in a custom column. Go to Preferences->Plugins and customize the Kindle driver, to tell it to use a custom column to get page count data.

---
 src/calibre/devices/kindle/apnx.py | 53 +++++++++++++++----
 src/calibre/devices/kindle/driver.py | 23 +++++++-
 .../gui2/device_drivers/configwidget.py | 1 +
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py
index a051c84be6..75b0804e6a 100644
--- a/src/calibre/devices/kindle/apnx.py
+++ b/src/calibre/devices/kindle/apnx.py
@@ -19,7 +19,12 @@ class APNXBuilder(object):
 Create an APNX file using a pseudo page mapping.
 '''

- def write_apnx(self, mobi_file_path, apnx_path, accurate=True):
+ def write_apnx(self, mobi_file_path, apnx_path, accurate=True, page_count=0):
+ '''
+ If you want a fixed number of pages (such as from a custom column) then
+ pass in a value to page_count, otherwise a count will be estimated
+ using either the fast or accurate algorithm.
+ '''
 # Check that this is really a MOBI file.
 with open(mobi_file_path, 'rb') as mf:
 ident = PdbHeaderReader(mf).identity()
@@ -28,16 +33,19 @@ class APNXBuilder(object):
 # Get the pages depending on the chosen parser
 pages = []
- if accurate:
- try:
- pages = self.get_pages_accurate(mobi_file_path)
- except:
- # Fall back to the fast parser if we can't
- # use the accurate one. Typically this is
- # due to the file having DRM.
- pages = self.get_pages_fast(mobi_file_path) + if page_count: + pages = self.get_pages_exact(mobi_file_path, page_count) else: - pages = self.get_pages_fast(mobi_file_path) + if accurate: + try: + pages = self.get_pages_accurate(mobi_file_path) + except: + # Fall back to the fast parser if we can't + # use the accurate one. Typically this is + # due to the file having DRM. + pages = self.get_pages_fast(mobi_file_path) + else: + pages = self.get_pages_fast(mobi_file_path) if not pages: raise Exception(_('Could not generate page mapping.')) @@ -77,6 +85,31 @@ class APNXBuilder(object): return apnx + def get_pages_exact(self, mobi_file_path, page_count): + ''' + Given a specified page count (such as from a custom column), + create our array of pages for the apnx file by dividing by + the content size of the book. + ''' + pages = [] + count = 0 + + with open(mobi_file_path, 'rb') as mf: + phead = PdbHeaderReader(mf) + r0 = phead.section_data(0) + text_length = struct.unpack('>I', r0[4:8])[0] + + chars_per_page = int(text_length / page_count) + while count < text_length: + pages.append(count) + count += chars_per_page + + if len(pages) > page_count: + # Rounding created extra page entries + pages = pages[:page_count] + + return pages + def get_pages_fast(self, mobi_file_path): ''' 2300 characters of uncompressed text per page. This is diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py index 1b10ce3050..c71eb67985 100644 --- a/src/calibre/devices/kindle/driver.py +++ b/src/calibre/devices/kindle/driver.py @@ -302,19 +302,28 @@ class KINDLE2(KINDLE): ' this information to the Kindle when uploading MOBI files by' ' USB. Note that the page numbers do not correspond to any paper' ' book.'), - _('Use slower but more accurate page number generation') + + _('Use slower but more accurate page number calculation') + ':::' + _('There are two ways to generate the page number information. Using the more accurate ' 'generator will produce pages that correspond better to a printed book. ' 'However, this method is slower and will slow down sending files ' 'to the Kindle.'), + _('Custom column name to retrieve page counts from') + + ':::' + + _('If you have a custom column in your library that you use to ' + 'store the page count of books, you can have calibre use that ' + 'information, instead of calculating a page count. Specify the ' + 'name of the custom column here, for example, #pages. 
'), + ] EXTRA_CUSTOMIZATION_DEFAULT = [ True, False, + '', ] OPT_APNX = 0 OPT_APNX_ACCURATE = 1 + OPT_APNX_CUST_COL = 2 def books(self, oncard=None, end_session=True): bl = USBMS.books(self, oncard=oncard, end_session=end_session) @@ -380,10 +389,20 @@ class KINDLE2(KINDLE): if not os.path.exists(path): os.makedirs(path) + cust_col_name = opts.extra_customization[self.OPT_APNX_CUST_COL] + custom_page_count = 0 + if cust_col_name: + try: + custom_page_count = int(metadata.get(cust_col_name, 0)) + except: + pass + apnx_path = '%s.apnx' % os.path.join(path, filename) apnx_builder = APNXBuilder() try: - apnx_builder.write_apnx(filepath, apnx_path, accurate=opts.extra_customization[self.OPT_APNX_ACCURATE]) + apnx_builder.write_apnx(filepath, apnx_path, + accurate=opts.extra_customization[self.OPT_APNX_ACCURATE], + page_count=custom_page_count) except: print 'Failed to generate APNX' import traceback diff --git a/src/calibre/gui2/device_drivers/configwidget.py b/src/calibre/gui2/device_drivers/configwidget.py index 2e5b77e5bc..94843f90e3 100644 --- a/src/calibre/gui2/device_drivers/configwidget.py +++ b/src/calibre/gui2/device_drivers/configwidget.py @@ -82,6 +82,7 @@ class ConfigWidget(QWidget, Ui_ConfigWidget): self.opt_extra_customization.append(QLineEdit(self)) l = QLabel(label_text) l.setToolTip(tt) + self.opt_extra_customization[i].setToolTip(tt) l.setBuddy(self.opt_extra_customization[i]) l.setWordWrap(True) self.opt_extra_customization[i].setText(settings.extra_customization[i]) From 6af42f147eb2bc708671450ad603797372aeb11a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 11 Feb 2012 22:46:12 +0530 Subject: [PATCH 07/29] ... --- src/calibre/gui2/viewer/main.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index f37465d02a..64521ecdd7 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -131,9 +131,16 @@ class Metadata(QLabel): class DoubleSpinBox(QDoubleSpinBox): + def __init__(self, *args, **kwargs): + QDoubleSpinBox.__init__(self, *args, **kwargs) + self.tt = _('Position in book') + self.setToolTip(self.tt) + def set_value(self, val): self.blockSignals(True) self.setValue(val) + self.setToolTip(self.tt + + ' [{0:.0%}]'.format(float(val)/self.maximum())) self.blockSignals(False) class HelpfulLineEdit(QLineEdit): @@ -197,7 +204,6 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.metadata = Metadata(self) self.pos = DoubleSpinBox() self.pos.setDecimals(1) - self.pos.setToolTip(_('Position in book')) self.pos.setSuffix('/'+_('Unknown')+' ') self.pos.setMinimum(1.) 
self.pos.setMinimumWidth(150) From c9a6bd6175e091cafd17180d1ca36c4df039010d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 11 Feb 2012 22:51:41 +0530 Subject: [PATCH 08/29] Fix Daily Mirror --- recipes/daily_mirror.recipe | 47 +++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/recipes/daily_mirror.recipe b/recipes/daily_mirror.recipe index 800eaf10e9..d6794b1d97 100644 --- a/recipes/daily_mirror.recipe +++ b/recipes/daily_mirror.recipe @@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): description = 'News as provide by The Daily Mirror -UK' __author__ = 'Dave Asbury' - # last updated 26/12/11 + # last updated 11/2/12 language = 'en_GB' cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg' @@ -14,35 +14,58 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): oldest_article = 1 - max_articles_per_feed = 20 + max_articles_per_feed = 5 remove_empty_feeds = True remove_javascript = True no_stylesheets = True auto_cleanup = True + #conversion_options = { 'linearize_tables' : True } + + + #keep_only_tags = [ + # dict(name='h1'), + # dict(name='div',attrs={'id' : 'body-content'}), + #dict(name='div',atts={'class' : 'article-body'}), + #dict(attrs={'class' : ['article-attr','byline append-1','published']}), + #dict(name='p'), + # ] + + #remove_tags_after = [dict (name='div',attrs={'class' : 'related'})] + remove_tags = [ dict(name='title'), dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}), + # dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}), + #dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}), + #dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}), ] + # preprocess_regexps = [ + #(re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: '')] preprocess_regexps = [ (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')] preprocess_regexps = [ (re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')] + #preprocess_regexps = [ + #(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')] feeds = [ - (u'News', u'http://www.mirror.co.uk/news/rss.xml') - ,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml') - ,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml') - ,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml') - ,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml') - ,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml') - ,(u'Sport','http://www.mirror.co.uk/sport/rss.xml') - ,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml') - ,(u'Advice','http://www.mirror.co.uk/advice/rss.xml') - ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml') + (u'UK News', u'http://feed43.com/0287771688643868.xml') + ,(u'Tech News', u'http://feed43.com/2455520588350501.xml') + ,(u'Weird World','http://feed43.com/0863800333634654.xml') + ,(u'Sport','http://feed43.com/7713243036546130.xml') + ,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml') + ,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml') + ,(u'Sport : Other','http://feed43.com/4501416886323415.xml') + ,(u'TV and Film','http://feed43.com/5238302853765104.xml') + ,(u'Celebs','http://feed43.com/8770061048844683.xml') + ,(u'Life Style : Family','http://feed43.com/4356170742410338.xml') + ,(u'Travel','http://feed43.com/1436576006476607.xml') + + # example of commented out feed not 
needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml') ] From 5b10bea77b447495d75ce519357f41e3f7070b06 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 11 Feb 2012 22:55:19 +0530 Subject: [PATCH 09/29] Catholic Daily Readings by adoucette --- recipes/catholic_daily_readings.recipe | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 recipes/catholic_daily_readings.recipe diff --git a/recipes/catholic_daily_readings.recipe b/recipes/catholic_daily_readings.recipe new file mode 100644 index 0000000000..f90adb9a6e --- /dev/null +++ b/recipes/catholic_daily_readings.recipe @@ -0,0 +1,11 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class BasicUserRecipe1328971305(BasicNewsRecipe): + title = u'Catholic Daily Readings' + language = 'en' + __author__ = 'adoucette' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Daily Readings - USCCB', u'http://www.usccb.org/bible/readings/rss/'), (u'Daily Reflection - One Bread One Body', u'http://www.presentationministries.com/general/rss.asp'), (u'Mass Readings - Universalis', u'http://www.universalis.com/atommass3.xml'), (u'Saint Of The Day - CNA', u'http://feeds.feedburner.com/catholicnewsagency/saintoftheday')] From ea0432d3fe2282158c48f14d1f9ffd6e6f4720ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 12 Feb 2012 11:13:15 +0530 Subject: [PATCH 10/29] Fix #930648 (Automatic adding - Languages field) --- src/calibre/ebooks/metadata/meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 73ba7e77f4..1b8855a157 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -233,7 +233,7 @@ def forked_read_metadata(path, tdir): f.write(mi.cover_data[1]) mi.cover_data = (None, None) mi.cover = 'cover.jpg' - opf = metadata_to_opf(mi) + opf = metadata_to_opf(mi, default_lang='und') with open(os.path.join(tdir, 'metadata.opf'), 'wb') as f: f.write(opf) From 24e4e1cf0027a8443c4d27901575f9989b63c94a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 12 Feb 2012 12:00:11 +0530 Subject: [PATCH 11/29] EPUB Output: Fix splitting breaking internal links in the epub, if the links pointed to files with URL unsafe characters in their file names. 
Fixes #929966 (HTML links break when converting epub to epub) --- src/calibre/ebooks/oeb/transforms/split.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 96e4b08079..8d826b9963 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -15,8 +15,8 @@ from lxml.etree import XPath as _XPath from lxml import etree from lxml.cssselect import CSSSelector -from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \ - urldefrag, rewrite_links, urlunquote, barename, XHTML +from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, + urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize) from calibre.ebooks.epub import rules XPath = functools.partial(_XPath, namespaces=NAMESPACES) @@ -159,6 +159,7 @@ class Split(object): except ValueError: # Unparseable URL return url + href = urlnormalize(href) if href in self.map: anchor_map = self.map[href] nhref = anchor_map[frag if frag else None] From fcf4957e91a62d5d89f88e269fb4cb11d1262e78 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 12 Feb 2012 19:04:45 +0530 Subject: [PATCH 12/29] Remove de Volksrant subscription version as it is no longer available --- recipes/volksrant_sub.recipe | 115 ----------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 recipes/volksrant_sub.recipe diff --git a/recipes/volksrant_sub.recipe b/recipes/volksrant_sub.recipe deleted file mode 100644 index 8a5f1543b5..0000000000 --- a/recipes/volksrant_sub.recipe +++ /dev/null @@ -1,115 +0,0 @@ -from calibre import strftime -from calibre.web.feeds.news import BasicNewsRecipe - -class Volkskrant_full(BasicNewsRecipe): - # This recipe will download the Volkskrant newspaper, - # from the subscribers site. It requires a password. - # Known issues are: articles that are spread out over - # multiple pages will appear multiple times. Pages - # that contain only adverts will appear, but empty. - # The supplement 'Volkskrant Magazine' on saturday - # is currently not downloaded. - # You can set a manual date, to download an archived - # newspaper. Volkskrant stores over a month at the - # moment of writing. To do so I suggest you unmark - # the date on the line below, and insert it in the title. Then - # follow the instructions marked further below. - - title = 'De Volkskrant (subscription)' # [za, 13 nov 2010]' - __author__ = u'Selcal' - description = u"Volkskrant" - oldest_article = 30 - max_articles_per_feed = 100 - no_stylesheets = True - language = 'nl' - use_embedded_content = False - simultaneous_downloads = 1 - delay = 1 - needs_subscription = True - # Set RETRIEVEDATE to 'yyyymmdd' to load an older - # edition. Otherwise keep '%Y%m%d' - # When setting a manual date, unmark and add the date - # to the title above, and unmark the timefmt line to stop - # Calibre from adding today's date in addition. 
- - # timefmt = '' - RETRIEVEDATE = strftime('%Y%m%d') - INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text' - INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/' - LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do' - remove_tags = [dict(name='address')] - cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg' - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - - if self.username is not None and self.password is not None: - br.open(self.LOGIN) - br.select_form(nr = 0) - br['username'] = self.username - br['password'] = self.password - br.submit() - return br - - def parse_index(self): - krant = [] - def strip_title(_title): - i = 0 - while ((_title[i] <> ":") and (i <= len(_title))): - i = i + 1 - return(_title[0:i]) - for temp in range (5): - try: - soup = self.index_to_soup(self.INDEX_MAIN) - break - except: - #print '(Retrying main index load)' - continue - mainsoup = soup.find('td', attrs={'id': 'select_page_top'}) - for option in mainsoup.findAll('option'): - articles = [] - _INDEX = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/#text' - _INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/' - #print '' - #print '<------- Processing section: ' + _INDEX + ' ------------------------->' - for temp in range (5): - try: - soup = self.index_to_soup(_INDEX) - break - except: - #print '(Retrying index load)' - continue - for item in soup.findAll('area'): - art_nr = item['class'] - attrname = art_nr[0:12] + '_section' + option['value'][0:5] + '_' + art_nr[26:len(art_nr)] - #print '==> Found: ' + attrname; - index_title = soup.find('div', attrs={'class': attrname}) - get_title = index_title['title']; - _ARTICLE = _INDEX_ARTICLE + attrname + '.html#text' - title = get_title; - #print '--> Title: ' + title; - #print '--> URL: ' + _ARTICLE; - for temp in range (5): - try: - souparticle = self.index_to_soup(_ARTICLE); - break - except: - print '(Retrying URL load)' - continue - headerurl = souparticle.findAll('frame')[0]['src']; - #print '--> Read frame name for header: ' + headerurl; - url = _INDEX_ARTICLE + headerurl[0:len(headerurl)-12] + '_text.html'; - #print '--> Corrected URL: ' + url; - if (get_title <> ''): - title = strip_title(get_title) - date = strftime(' %B %Y') - if (title <> ''): - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':'' - }) - krant.append( (option.string, articles)) - return krant - From 4451b829920d77a76cfdd05fe33c3eba5d0fd3f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 12 Feb 2012 19:14:52 +0530 Subject: [PATCH 13/29] ... --- src/calibre/manual/Makefile | 2 +- src/calibre/manual/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/manual/Makefile b/src/calibre/manual/Makefile index dc72b40f3f..c1a2279abf 100644 --- a/src/calibre/manual/Makefile +++ b/src/calibre/manual/Makefile @@ -25,7 +25,7 @@ clean: html: mkdir -p .build/html .build/doctrees - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) .build/html + $(SPHINXBUILD) -b html -t online $(ALLSPHINXOPTS) .build/html @echo @echo "Build finished. The HTML pages are in .build/html." 
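The `-t online` switch added to the sphinx-build invocation above defines a Sphinx build tag named `online`; the `.. only:: online` directive in the index.rst hunk that follows renders its block only when that tag is set. As a hedged illustration (not part of this patch), the same tag can also be tested from the manual's conf.py, where Sphinx injects a `tags` object at configuration time:

    # Sketch for conf.py; tags.has() is standard Sphinx API, while the
    # `online_build` template flag is a hypothetical name for illustration.
    if tags.has('online'):  # True when built with `sphinx-build -t online`
        html_context = {'online_build': True}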
diff --git a/src/calibre/manual/index.rst b/src/calibre/manual/index.rst index 81b8ac88f1..d0d6bfb9b5 100755 --- a/src/calibre/manual/index.rst +++ b/src/calibre/manual/index.rst @@ -17,7 +17,7 @@ To get started with more advanced usage, you should read about the :ref:`Graphic .. only:: online - An ebook version of this user manual is available in `EPUB format `_. + **An ebook version of this user manual is available in** `EPUB format `_. Sections ------------ From 2eab5b4486445193d535fb29799c613498c3f545 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 13 Feb 2012 09:56:06 +0530 Subject: [PATCH 14/29] Updated Le Devoir --- recipes/ledevoir.recipe | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/recipes/ledevoir.recipe b/recipes/ledevoir.recipe index bc473be181..830ac167ee 100644 --- a/recipes/ledevoir.recipe +++ b/recipes/ledevoir.recipe @@ -1,8 +1,8 @@ __license__ = 'GPL v3' -__author__ = 'Lorenzo Vigentini' -__copyright__ = '2009, Lorenzo Vigentini ' +__author__ = 'Lorenzo Vigentini and Olivier Daigle' +__copyright__ = '2012, Lorenzo Vigentini , Olivier Daigle ' __version__ = 'v1.01' -__date__ = '14, January 2010' +__date__ = '12, February 2012' __description__ = 'Canadian Paper ' ''' @@ -12,13 +12,14 @@ http://www.ledevoir.com/ import re from calibre.web.feeds.news import BasicNewsRecipe +from datetime import date class ledevoir(BasicNewsRecipe): author = 'Lorenzo Vigentini' description = 'Canadian Paper. A subscription is optional, with it you get more content' cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif' - title = u'Le Devoir' + title = u'Le Devoir ' publisher = 'leDevoir.com' category = 'News, finance, economy, politics' @@ -26,11 +27,15 @@ class ledevoir(BasicNewsRecipe): encoding = 'utf-8' timefmt = '[%a, %d %b, %Y]' - max_articles_per_feed = 50 + oldest_article = 1 + max_articles_per_feed = 200 use_embedded_content = False recursion = 10 needs_subscription = 'optional' + filterDuplicates = False + url_list = [] + remove_javascript = True no_stylesheets = True @@ -38,7 +43,7 @@ class ledevoir(BasicNewsRecipe): keep_only_tags = [ dict(name='div', attrs={'id':'article'}), - dict(name='ul', attrs={'id':'ariane'}) + dict(name='div', attrs={'id':'colonne_principale'}) ] remove_tags = [ @@ -51,7 +56,7 @@ class ledevoir(BasicNewsRecipe): feeds = [ (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'), - (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), + (u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'), (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'), (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'), @@ -61,7 +66,7 @@ class ledevoir(BasicNewsRecipe): (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'), (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'), (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'), - (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50') + (u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50') ] extra_css = ''' @@ -85,8 +90,16 @@ class ledevoir(BasicNewsRecipe): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: br.open('http://www.ledevoir.com') - br.select_form(nr=1) - br['login[courriel]'] = self.username - br['login[password]'] = self.password + br.select_form(nr=0) + br['login_popup[courriel]'] = 
self.username + br['login_popup[password]'] = self.password br.submit() return br + + def print_version(self, url): + if self.filterDuplicates: + if url in self.url_list: + return + self.url_list.append(url) + return url + From afff6c814b24911756f43a697c17b1d2146f292e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 13 Feb 2012 10:15:30 +0530 Subject: [PATCH 15/29] Updated kurier --- recipes/kurier.recipe | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/recipes/kurier.recipe b/recipes/kurier.recipe index 21674f953f..8e300b68cd 100644 --- a/recipes/kurier.recipe +++ b/recipes/kurier.recipe @@ -13,9 +13,10 @@ class Kurier(BasicNewsRecipe): publisher = 'KURIER' category = 'news, politics, Austria' oldest_article = 2 - max_articles_per_feed = 200 + max_articles_per_feed = 100 + timeout = 30 + encoding = None no_stylesheets = True - encoding = 'cp1252' use_embedded_content = False language = 'de_AT' remove_empty_feeds = True @@ -29,9 +30,11 @@ class Kurier(BasicNewsRecipe): , 'language' : language } - remove_tags = [dict(attrs={'class':['functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})] + remove_tags = [ dict(attrs={'id':['artikel_expand_symbol2','imgzoom_close2']}), + dict(attrs={'class':['linkextern','functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']}) + ] keep_only_tags = [dict(attrs={'id':'content'})] - remove_tags_after = dict(attrs={'id':'author'}) + remove_tags_after = [dict(attrs={'id':'author'})] remove_attributes = ['width','height'] feeds = [ @@ -41,7 +44,7 @@ class Kurier(BasicNewsRecipe): ,(u'Kultur' , u'http://kurier.at/rss/kultur_kultur_rss.xml' ) ,(u'Freizeit' , u'http://kurier.at/rss/freizeit_freizeit_rss.xml' ) ,(u'Wetter' , u'http://kurier.at/rss/oewetter_rss.xml' ) - ,(u'Verkehr' , u'http://kurier.at/rss/verkehr_rss.xml' ) + ,(u'Sport' , u'http://kurier.at/newsfeed/detail/sport_rss.xml' ) ] def preprocess_html(self, soup): From 875a6a7325d92d09b313dfeb946309770e627bf0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 13 Feb 2012 10:32:43 +0530 Subject: [PATCH 16/29] Fix #931087 (New device) --- src/calibre/devices/android/driver.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 69eac39c41..bc3fea4dcb 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -75,6 +75,7 @@ class ANDROID(USBMS): # Google 0x18d1 : { 0x0001 : [0x0223, 0x9999], + 0x0003 : [0x0230], 0x4e11 : [0x0100, 0x226, 0x227], 0x4e12 : [0x0100, 0x226, 0x227], 0x4e21 : [0x0100, 0x226, 0x227, 0x231], @@ -167,7 +168,7 @@ class ANDROID(USBMS): 'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA', 'GENERIC-', 'ZTE', 'MID', 'QUALCOMM', 'PANDIGIT', 'HYSTON', 'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP', - 'POCKET'] + 'POCKET', 'ONDA_MID'] WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE', '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897', 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', @@ -181,13 +182,14 @@ class ANDROID(USBMS): 'ALPANDIGITAL', 'ANDROID_MID', 'VTAB1008', 'EMX51_BBG_ANDROI', 'UMS', '.K080', 'P990', 'LTE', 'MB853', 'GT-S5660_CARD', 'A107', 'GT-I9003_CARD', 'XT912', 'FILE-CD_GADGET', 'RK29_SDK', 'MB855', - 'XT910', 'BOOK_A10'] + 'XT910', 'BOOK_A10', 'USB_2.0_DRIVER'] 
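The tables extended in this hunk drive calibre's USB device detection: VENDOR_ID maps a USB vendor id to product ids and the accepted BCD revisions for each, while VENDOR_NAME and the WINDOWS_*_MEM lists are compared against the Windows PnP id string reported for the storage volume. The following sketch shows roughly how such tables are consulted; `claims_device` is a hypothetical helper, and the real logic in calibre's Device/USBMS base classes is considerably more involved:

    # Hypothetical illustration only, not code from this patch.
    def claims_device(vid, pid, bcd, pnp_id):
        # The USB ids must appear in the nested VENDOR_ID table...
        if bcd not in ANDROID.VENDOR_ID.get(vid, {}).get(pid, []):
            return False
        pnp = pnp_id.upper()
        # ...and the PnP string must name a known vendor and volume label.
        return (any(v in pnp for v in ANDROID.VENDOR_NAME) and
                any(m in pnp for m in ANDROID.WINDOWS_MAIN_MEM))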
WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', 'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD', '__UMS_COMPOSITE', 'SGH-I997_CARD', 'MB870', 'ALPANDIGITAL', 'ANDROID_MID', 'P990_SD_CARD', '.K080', 'LTE_CARD', 'MB853', - 'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD'] + 'A1-07___C0541A4F', 'XT912', 'MB855', 'XT910', 'BOOK_A10_CARD', + 'USB_2.0_DRIVER'] OSX_MAIN_MEM = 'Android Device Main Memory' From d9105465b97aecb41571474b6abcc6e49b2a239a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 13 Feb 2012 11:21:19 +0530 Subject: [PATCH 17/29] Fix Samanyolu Haber --- recipes/icons/samanyolu_haber.png | Bin 0 -> 968 bytes recipes/samanyolu_haber.recipe | 11 ++++++----- 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 recipes/icons/samanyolu_haber.png diff --git a/recipes/icons/samanyolu_haber.png b/recipes/icons/samanyolu_haber.png new file mode 100644 index 0000000000000000000000000000000000000000..4bfd29abac7e3a0a9f7697ca330c25818e449ca1 GIT binary patch literal 968 zcmeAS@N?(olHy`uVBq!ia0vp^0w65F1|Id)X9bz8q~OPWCAQCg#h;CeM*A zn-J8O6(b|#^G{!yi(S{!L$}^bWVY~IqrfFABXfS&)U|)N`}-^D+s-}TYV6N{-ZTIF z=UTh_hF%+6B;?FyZi!k{5z{eGPU&3dnKY$yme*f)NF?isD0z7&zxgB-7^h!Rb?U~Y zOK)>kCh6FF>)Dv37)9#1Y?OF@`LauoAZxQgOl<3d>)t9Y0_)eiI0!_)fi zUDEf5j5U>>w-^=M+Io)tt6P*YrFc5`p)})`@89dW_;@6EUcTI5WTYU`zqsLUY{{a% zvcES69>_RJoqgZ9QR2q!?&V_Ln-p%|I_FnVD%!eX$#t36 zpN#_#9A%A&o_$7?Q(8JPHMNnOd%^wnU#9)O|M>H#gqL3*{H$Em(964F6<43D>;CB* z_weZHHMO@VI5%I)FA|a!cKW$zt?x8uIax!FRl@R1R%-{WW_>!L(k7wMFvFyYmG!}G z=^gv|6D@N-J=wW)Lyg^qw{7$9o}1FQbm;~+_k(?W8`kjLx#N-$FTXV6gnInZlbkaq zD_i&4Oyu}&?U<&v=2z8*Ju;1c&+iu&O?&p-@#oPu@7!t>6#fQ_EV!)6Jt5xaoMYsI zHv!q&8rD1a@%i~U{P_HNw_jKGr@d26O&9Fp4e~fQ;g9vV3WbXL`C9+&-#bi^?oEAA z2h19(C9V-ADTyViR>?)FK#IZ0z|ct7z)087IK Date: Tue, 14 Feb 2012 09:59:33 +0530 Subject: [PATCH 18/29] Novinite BG by M3 Web --- recipes/novinite_bg.recipe | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 recipes/novinite_bg.recipe diff --git a/recipes/novinite_bg.recipe b/recipes/novinite_bg.recipe new file mode 100644 index 0000000000..637a387760 --- /dev/null +++ b/recipes/novinite_bg.recipe @@ -0,0 +1,26 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1329123365(BasicNewsRecipe): + title = u'Novinite.bg' + __author__ = 'M3 Web' + description = 'Real time provider of the latest news from Bulgaria and the world' + category = 'Business, Politics, Society, Sports, Crime, Lifestyle, World, Health' + oldest_article = 7 + max_articles_per_feed = 6 + language = 'bg' + encoding = 'windows-1251' + no_stylesheets = True + remove_javascript = True + keep_only_tags = [dict(name='div', attrs={'id':'content'})] + remove_tags = [dict(name='div', attrs={'id':'text_options'})] + remove_tags = [dict(name='div', attrs={'id':'social_shares_top'})] + remove_tags_after = dict(id='textsize') + feeds = [(u'Business', u'http://novinite.bg/rss.php?category_id=1'), +(u'Politics', u'http://novinite.bg/rss.php?category_id=2'), +(u'Society', u'http://novinite.bg/rss.php?category_id=3'), +(u'Sport', u'http://novinite.bg/rss.php?category_id=4'), +(u'Crime', u'http://novinite.bg/rss.php?category_id=5'), +(u'Lifestyle', u'http://novinite.bg/rss.php?category_id=6'), +(u'Health', u'http://novinite.bg/rss.php?category_id=7'), +(u'Other', u'http://novinite.bg/rss.php?category_id=10'), +(u'World', u'http://novinite.bg/rss.php?category_id=9')] From 
1deef8da06c341e211f477a7a9906b5bf5f61e5d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 14 Feb 2012 12:36:23 +0530
Subject: [PATCH 19/29] Albert Mohler, Desiring God, Living Stones and Resurgence by Peter Grungi

---
 recipes/albert_mohler.recipe | 18 ++++++++++++++++++
 recipes/desiring_god.recipe | 21 +++++++++++++++++++++
 recipes/living_stones.recipe | 25 +++++++++++++++++++++++++
 recipes/resurgence.recipe | 20 ++++++++++++++++++++
 4 files changed, 84 insertions(+)
 create mode 100644 recipes/albert_mohler.recipe
 create mode 100644 recipes/desiring_god.recipe
 create mode 100644 recipes/living_stones.recipe
 create mode 100644 recipes/resurgence.recipe

diff --git a/recipes/albert_mohler.recipe b/recipes/albert_mohler.recipe
new file mode 100644
index 0000000000..fca16ccae9
--- /dev/null
+++ b/recipes/albert_mohler.recipe
@@ -0,0 +1,18 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Peter Grungi'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AlbertMohlersBlog(BasicNewsRecipe):
+ title = u'Albert Mohler\'s Blog'
+ __author__ = 'Peter Grungi'
+ language = 'en'
+ oldest_article = 90
+ max_articles_per_feed = 10
+ auto_cleanup = True
+ cover_url = 'http://www.albertmohler.com/wp-content/themes/albert-mohler-v5/img/logo-am-lg.gif'
+ publisher = 'Albert Mohler'
+ language = 'en'
+ author = 'Albert Mohler'
+
+ feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]
diff --git a/recipes/desiring_god.recipe b/recipes/desiring_god.recipe
new file mode 100644
index 0000000000..42422f7226
--- /dev/null
+++ b/recipes/desiring_god.recipe
@@ -0,0 +1,21 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Peter Grungi'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class DesiringGodEnglish(BasicNewsRecipe):
+ title = u'Desiring God'
+ __author__ = 'Peter Grungi'
+ language = 'en'
+
+ cover_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
+ masthead_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
+ language = 'en'
+ oldest_article = 7
+ max_articles_per_feed = 50
+ auto_cleanup = True
+ publisher = 'Desiring God Ministries'
+ author = 'Desiring God Ministries'
+
+ feeds = [(u'Desiring God Blog', u'http://feeds.feedburner.com/DGBlog?format=xml')]
diff --git a/recipes/living_stones.recipe b/recipes/living_stones.recipe
new file mode 100644
index 0000000000..db5869f544
--- /dev/null
+++ b/recipes/living_stones.recipe
@@ -0,0 +1,25 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Peter Grungi'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LivingStonesPastorsBlog(BasicNewsRecipe):
+ title = u'Living Stones Pastors Blog'
+ __author__ = 'Peter Grungi'
+ language = 'en'
+
+ oldest_article = 90
+ max_articles_per_feed = 10
+ auto_cleanup = True
+ cover_url = 'http://blogs.livingstonesreno.com/wp-content/uploads/2011/08/blogBGRD_norepeat.jpg'
+ masthead_url = 'http://www.livingstonesreno.com/podcast/LSpodcastnew.jpg'
+ publisher = 'Living Stones Church of Reno, NV'
+ language = 'en'
+ author = 'Living Stones Church of Reno, NV'
+
+ feeds = [(u'LS Blog', u'http://blogs.livingstonesreno.com/feed?utm_source=calibre&utm_medium=rss')]
+
+ def full_version(self, url):
+ import re
+ newurl = re.sub(r'\?.*','',url)
+ return newurl
diff --git a/recipes/resurgence.recipe b/recipes/resurgence.recipe
new file mode 100644
index 0000000000..46056def8f
--- /dev/null
+++ b/recipes/resurgence.recipe
@@ -0,0 +1,20 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Peter Grungi'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheResurgence(BasicNewsRecipe):
+ title = u'The Resurgence'
+ __author__ = 'Peter Grungi'
+ language = 'en'
+
+ oldest_article = 7
+ max_articles_per_feed = 10
+ auto_cleanup = True
+ cover_url = 'http://cdn.theresurgence.com/images/logo.png'
+ masthead_url = 'http://cdn.theresurgence.com/images/logo.png'
+ language = 'en'
+ publisher = 'The Resurgence'
+ author = 'The Resurgence'
+
+ feeds = [(u'The Resurgence', u'http://feeds.theresurgence.com/TheResurgence?format=xml')]

From dd0dc01b7ee143855b83985e63e71629df2c86d9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 14 Feb 2012 13:25:24 +0530
Subject: [PATCH 20/29] Fix #930788 (Android Smartphone Sony Ericsson Xperia Neo not detected)

---
 src/calibre/devices/android/driver.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index bc3fea4dcb..aeda2ab731 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -70,6 +70,7 @@ class ANDROID(USBMS):
 0xd12e : [0x0100],
 0xe14f : [0x0226],
 0x614f : [0x0226, 0x100],
+ 0x6156 : [0x0226, 0x100],
 },

 # Google

From 20e46a53c628a1b619dca6badbf1e44e8bd5562e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 14 Feb 2012 19:58:50 +0530
Subject: [PATCH 21/29] ...

---
 setup/upload.py | 12 ++++--------
 src/calibre/ebooks/pdf/pdftohtml.py | 7 ++++---
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/setup/upload.py b/setup/upload.py
index d07a26b4d7..ffeb248b91 100644
--- a/setup/upload.py
+++ b/setup/upload.py
@@ -5,7 +5,7 @@
 __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'

-import os, re, subprocess, hashlib, shutil, glob, stat, sys, time
+import os, subprocess, hashlib, shutil, glob, stat, sys, time
 from subprocess import check_call
 from tempfile import NamedTemporaryFile, mkdtemp
 from zipfile import ZipFile
@@ -64,15 +64,11 @@ class ReUpload(Command): # {{{

 # Data {{{
 def get_google_data():
- PASSWORD_FILE = os.path.expanduser('~/.googlecodecalibre')
- OFFLINEIMAP = os.path.expanduser('~/work/kde/conf/offlineimap/rc')
+ with open(os.path.expanduser('~/.googlecodecalibre'), 'rb') as f:
+ gc_password, ga_un, pw = f.read().strip().split('|')

- gc_password = open(PASSWORD_FILE).read().strip()
- raw = open(OFFLINEIMAP).read()
- pw = re.search(r'(?s)remoteuser = .*@gmail.com.*?remotepass = (\S+)',
- raw).group(1).strip()
 return {
- 'username':'kovidgoyal@gmail.com', 'password':pw, 'gc_password':gc_password,
+ 'username':ga_un, 'password':pw, 'gc_password':gc_password,
 'path_map_server':'root@kovidgoyal.net',
 'path_map_location':'/var/www/status.calibre-ebook.com/googlepaths',
 # If you change this remember to change it in the
diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py
index 9d81c73c2a..dbb3e5d2c2 100644
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -69,11 +69,12 @@ def pdftohtml(output_dir, pdf_path, no_images):
 raise
 logf.flush()
 logf.close()
- out = open(logf.name, 'rb').read()
+ out = open(logf.name, 'rb').read().strip()
 if ret != 0:
 raise ConversionError(out)
- print "pdftohtml log:"
- print out
+ if out:
+ print "pdftohtml log:"
+ print out

 if not os.path.exists(index) or os.stat(index).st_size < 100:
 raise DRMError()

From 7895652c76ed918930c4843bcf3f4a9772fa11fe Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 14 Feb 2012 20:14:35 +0530
Subject: [PATCH 22/29] When 
reading metadata from very large HTML files, do not search the entire file for the metadata, as this can be very slow --- src/calibre/ebooks/metadata/html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index a2ca76aa54..a8cc6c0eae 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -34,6 +34,7 @@ def get_metadata_(src, encoding=None): # Title title = None pat = re.compile(r'', re.DOTALL) + src = src[:150000] # Searching shouldn't take too long match = pat.search(src) if match: title = match.group(2) From 3563c895e1f8c7742075f5b3460bd8c0f68a2daf Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Feb 2012 08:33:18 +0530 Subject: [PATCH 23/29] Fix #932262 (HTML meta tags not read in case of multiple spaces) --- src/calibre/ebooks/metadata/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/html.py b/src/calibre/ebooks/metadata/html.py index a8cc6c0eae..03aa4cf32a 100644 --- a/src/calibre/ebooks/metadata/html.py +++ b/src/calibre/ebooks/metadata/html.py @@ -19,7 +19,7 @@ def get_metadata(stream): return get_metadata_(src) def get_meta_regexp_(name): - return re.compile('', re.IGNORECASE) + return re.compile('', re.IGNORECASE) def get_metadata_(src, encoding=None): if not isinstance(src, unicode): From 0596dd3d4ecf7d977ea5a02b71aaccbfaf2a7e19 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Feb 2012 16:17:50 +0530 Subject: [PATCH 24/29] Fix regression that broke getting covers from some epub files on OS X. Fixes #932507 (recent update no longer pulling covers from dotepub files) --- src/calibre/ebooks/metadata/epub.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/metadata/epub.py b/src/calibre/ebooks/metadata/epub.py index 477b805ba0..c62f265633 100644 --- a/src/calibre/ebooks/metadata/epub.py +++ b/src/calibre/ebooks/metadata/epub.py @@ -160,6 +160,7 @@ def render_cover(opf, opf_path, zf, reader=None): with open(cpage, 'r+b') as f: raw = f.read() f.truncate(0) + f.seek(0) raw = ffpat.sub(b'', raw) f.write(raw) from calibre.ebooks.chardet import xml_to_unicode @@ -174,6 +175,7 @@ def render_cover(opf, opf_path, zf, reader=None): with open(path, 'r+b') as f: raw = f.read() f.truncate(0) + f.seek(0) raw = ffpat.sub(b'', raw) f.write(raw) From ea16f73299d835a3671f5f1808419c5b52b8f1d3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Feb 2012 21:10:24 +0530 Subject: [PATCH 25/29] Fix #932790 (Remove recipe for dead serbian site Borba Online) --- recipes/borba.recipe | 95 -------------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 recipes/borba.recipe diff --git a/recipes/borba.recipe b/recipes/borba.recipe deleted file mode 100644 index e6f3b4adc6..0000000000 --- a/recipes/borba.recipe +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' - -''' -borba.rs -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - -class Borba(BasicNewsRecipe): - title = 'Borba Online' - __author__ = 'Darko Miletic' - description = 'Dnevne novine Borba Online' - publisher = 'IP Novine Borba' - category = 'news, politics, Serbia' - language = 'sr' - - lang = _('sr-Latn-RS') - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg' - INDEX = 
u'http://www.borba.rs/' - extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} ' - - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : lang - , 'pretty_print' : True - } - - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - keep_only_tags = [dict(name='div', attrs={'class':'main'})] - - remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'}) - - remove_tags = [ - dict(name=['object','link','iframe','base','img']) - ,dict(name='div',attrs={'id':'written_comments_title'}) - ] - - feeds = [ - (u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/') - ,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' ) - ,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' ) - ,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' ) - ,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' ) - ,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' ) - ,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' ) - ,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' ) - ,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/') - ] - - def preprocess_html(self, soup): - attribs = [ 'style','font','valign' - ,'colspan','width','height' - ,'rowspan','summary','align' - ,'cellspacing','cellpadding' - ,'frames','rules','border' - ] - for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): - item.name = 'div' - for attrib in attribs: - if item.has_key(attrib): - del item[attrib] - return soup - - def parse_index(self): - totalfeeds = [] - lfeeds = self.get_feeds() - for feedobj in lfeeds: - feedtitle, feedurl = feedobj - self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) - articles = [] - soup = self.index_to_soup(feedurl) - for item in soup.findAll('a', attrs={'class':'contentpagetitle'}): - url = item['href'] - title = self.tag_to_string(item) - articles.append({ - 'title' :title - ,'date' :'' - ,'url' :url - ,'description':'' - }) - totalfeeds.append((feedtitle, articles)) - return totalfeeds - From ebbddfe46496545b113cdeea7200b5495c5c0945 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Feb 2012 22:35:53 +0530 Subject: [PATCH 26/29] Asian Review of Books by Darko Miletic. 
Fixes #932846 (new recipe for Asian review of books magazine) --- recipes/asianreviewofbooks.recipe | 51 +++++++++++++++++++++++++++ recipes/icons/asianreviewofbooks.png | Bin 0 -> 906 bytes src/calibre/utils/localization.py | 1 + 3 files changed, 52 insertions(+) create mode 100644 recipes/asianreviewofbooks.recipe create mode 100644 recipes/icons/asianreviewofbooks.png diff --git a/recipes/asianreviewofbooks.recipe b/recipes/asianreviewofbooks.recipe new file mode 100644 index 0000000000..f0912d8d98 --- /dev/null +++ b/recipes/asianreviewofbooks.recipe @@ -0,0 +1,51 @@ + +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +www.asianreviewofbooks.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AsianReviewOfBooks(BasicNewsRecipe): + title = 'The Asian Review of Books' + __author__ = 'Darko Miletic' + description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.' + publisher = 'The Asian Review of Books' + category = 'literature, books, reviews, Asia' + oldest_article = 30 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'cp1252' + language = 'en_HK' + publication_type = 'magazine' + masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png' + extra_css = """ + body{font-family: serif} + .big {font-size: xx-large} + .bold {font-weight: bold} + .italic {font-style: italic} + .small {font-size: small} + img {display: block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + + remove_tags = [dict(name=['object','script','iframe','embed'])] + remove_attributes = ['style', 'onclick'] + feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')] + + def print_version(self, url): + root, sep, artid = url.rpartition('?ID=') + return root + 'getarticle.php?articleID=' + artid + '&stats=web' + + def preprocess_raw_html(self, raw, url): + return 'title' + raw + '' + \ No newline at end of file diff --git a/recipes/icons/asianreviewofbooks.png b/recipes/icons/asianreviewofbooks.png new file mode 100644 index 0000000000000000000000000000000000000000..31f6d42ce0d1206be6d1203340d798b43f541186 GIT binary patch literal 906 zcmV;519kj~P)$F;_l+q=+s4{MMa@SDTXO+u5Dh+Uf1o{ zN4-Zyr$unea4)YfUA$dIq(zX{kksnbMW{uA!+}GRLp7o`M!QBZu`n)_E_t|lTbWza z?b9r+EcE*HN|s7<%yUDNLwUVU5n zaVUT&darsamnz=x-h#w}DTpaYzenrz>s`BDDTFD4#ezhbL@T2!M3Y1)dniPlL`J$s zD1j&|rz>~LcSfs5My^IHoGV0?L`AMeMz2Oyrd52rd`7%RD1s;}hbvB(PDZ^(y5YGi zohwA1L_VfID10a=e<&-SD@2_{Ly<#rxpFF!Dn_(MLybdux_Q*=)JMKYDv>HEdMHM* zM(FnFpwys$%YRm*RyU?MSejTZr7bI(Dkym+ToG@Z&&moJa@P1+j6gallRw0qM$#h$jr+rCCh|sSbi{8AphQ z3;J%PDgj#l^&lE Date: Thu, 16 Feb 2012 09:18:59 +0530 Subject: [PATCH 27/29] ... 
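
Do not auto-add annotation sidecar files: pdr, mbp and tan are
reader-generated annotation/bookmark sidecars (mbp is Mobipocket's
annotation format), not books. A minimal sketch of the effect; the
BOOK_EXTENSIONS import path and its contents are assumptions, as neither
is shown in this diff:

    # Sketch only; BOOK_EXTENSIONS import path and contents are assumed.
    from calibre.ebooks import BOOK_EXTENSIONS

    be = frozenset(BOOK_EXTENSIONS) - {'pdr', 'mbp', 'tan'}
    print 'epub' in be  # True: real books are still auto-added
    print 'mbp' in be   # False: annotation sidecars are skipped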
---
 src/calibre/gui2/auto_add.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/auto_add.py b/src/calibre/gui2/auto_add.py
index c49ca87bf6..74c3ec2bdd 100644
--- a/src/calibre/gui2/auto_add.py
+++ b/src/calibre/gui2/auto_add.py
@@ -26,7 +26,7 @@ class Worker(Thread):
         self.wake_up = Event()
         self.path, self.callback = path, callback
         self.staging = set()
-        self.be = frozenset(BOOK_EXTENSIONS)
+        self.be = frozenset(BOOK_EXTENSIONS) - {'pdr', 'mbp', 'tan'}
 
     def run(self):
         self.tdir = PersistentTemporaryDirectory('_auto_adder')

From e961004b11962fcea93b9b85a9a8dfefd4dfad38 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 16 Feb 2012 14:04:51 +0530
Subject: [PATCH 28/29] Amazon metadata download: Support the new 'Book
 Description' section that Amazon publishes for some books. Also work around
 the Amazon US servers occasionally returning broken markup, which led to
 calibre not finding any matches for books on Amazon.

---
 src/calibre/__init__.py | 12 +-
 src/calibre/ebooks/metadata/sources/amazon.py | 199 +++++++++++++-----
 src/calibre/ebooks/metadata/sources/base.py | 8 +-
 src/calibre/ebooks/metadata/sources/test.py | 13 +-
 src/calibre/library/comments.py | 1 +
 5 files changed, 167 insertions(+), 66 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 82c2519e29..bc88d473fd 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -350,20 +350,20 @@ def get_proxy_info(proxy_scheme, proxy_string):
 USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
 USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
 
-def random_user_agent():
+def random_user_agent(choose=None):
     choices = [
         'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
         'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
-        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
+        'Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
         'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.78 Safari/532.5',
         'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
     ]
-    #return choices[-1]
-    return choices[random.randint(0, len(choices)-1)]
+    if choose is None:
+        choose = random.randint(0, len(choices)-1)
+    return choices[choose]
 
 def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     '''
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 3d08b96c5f..cb724765f5 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -13,7 +13,7 @@ from threading import Thread
 from Queue import Queue, Empty
 
-from calibre import as_unicode
+from calibre import as_unicode, random_user_agent
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source,
Option, fixcase, fixauthors) @@ -174,8 +174,8 @@ class Worker(Thread): # Get details {{{ def get_details(self): from calibre.utils.cleantext import clean_ascii_chars - from calibre.utils.soupparser import fromstring from calibre.ebooks.chardet import xml_to_unicode + import html5lib try: raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() @@ -202,7 +202,8 @@ class Worker(Thread): # Get details {{{ return try: - root = fromstring(clean_ascii_chars(raw)) + root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', + namespaceHTMLElements=False) except: msg = 'Failed to parse amazon details page: %r'%self.url self.log.exception(msg) @@ -356,33 +357,46 @@ class Worker(Thread): # Get details {{{ if m is not None: return float(m.group(1))/float(m.group(3)) * 5 - def parse_comments(self, root): + def _render_comments(self, desc): from calibre.library.comments import sanitize_comments_html + for c in desc.xpath('descendant::noscript'): + c.getparent().remove(c) + for c in desc.xpath('descendant::*[@class="seeAll" or' + ' @class="emptyClear" or @id="collapsePS" or' + ' @id="expandPS"]'): + c.getparent().remove(c) + + for a in desc.xpath('descendant::a[@href]'): + del a.attrib['href'] + a.tag = 'span' + desc = self.tostring(desc, method='html', encoding=unicode).strip() + + # Encoding bug in Amazon data U+fffd (replacement char) + # in some examples it is present in place of ' + desc = desc.replace('\ufffd', "'") + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Collapse whitespace + #desc = re.sub('\n+', '\n', desc) + #desc = re.sub(' +', ' ', desc) + # Remove the notice about text referring to out of print editions + desc = re.sub(r'(?s)--This text ref.*?', '', desc) + # Remove comments + desc = re.sub(r'(?s)', '', desc) + return sanitize_comments_html(desc) + + + def parse_comments(self, root): + ans = '' + desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') + if desc: + ans = self._render_comments(desc[0]) + desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') if desc: - desc = desc[0] - for c in desc.xpath('descendant::*[@class="seeAll" or' - ' @class="emptyClear"]'): - c.getparent().remove(c) - for a in desc.xpath('descendant::a[@href]'): - del a.attrib['href'] - a.tag = 'span' - desc = self.tostring(desc, method='html', encoding=unicode).strip() - - # Encoding bug in Amazon data U+fffd (replacement char) - # in some examples it is present in place of ' - desc = desc.replace('\ufffd', "'") - # remove all attributes from tags - desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) - # Collapse whitespace - #desc = re.sub('\n+', '\n', desc) - #desc = re.sub(' +', ' ', desc) - # Remove the notice about text referring to out of print editions - desc = re.sub(r'(?s)--This text ref.*?', '', desc) - # Remove comments - desc = re.sub(r'(?s)', '', desc) - return sanitize_comments_html(desc) + ans += self._render_comments(desc[0]) + return ans def parse_cover(self, root): imgs = root.xpath('//img[@id="prodImage" and @src]') @@ -467,6 +481,28 @@ class Amazon(Source): Source.__init__(self, *args, **kwargs) self.set_amazon_id_touched_fields() + def test_fields(self, mi): + ''' + Return the first field from self.touched_fields that is null on the + mi object + ''' + for key in self.touched_fields: + if key.startswith('identifier:'): + key = key.partition(':')[-1] + if key == 'amazon': + if self.domain != 'com': + key += '_' + self.domain + if not mi.has_identifier(key): + return 
'identifier: ' + key + elif mi.is_null(key): + return key + + @property + def user_agent(self): + # Pass in an index to random_user_agent() to test with a particular + # user agent + return random_user_agent() + def save_settings(self, *args, **kwargs): Source.save_settings(self, *args, **kwargs) self.set_amazon_id_touched_fields() @@ -507,6 +543,9 @@ class Amazon(Source): @property def domain(self): + x = getattr(self, 'testing_domain', None) + if x is not None: + return x domain = self.prefs['domain'] if domain not in self.AMAZON_DOMAINS: domain = 'com' @@ -599,16 +638,52 @@ class Amazon(Source): return url # }}} + def parse_results_page(self, root): # {{{ + from lxml.html import tostring + + matches = [] + + def title_ok(title): + title = title.lower() + for x in ('bulk pack', '[audiobook]', '[audio cd]'): + if x in title: + return False + return True + + for div in root.xpath(r'//div[starts-with(@id, "result_")]'): + for a in div.xpath(r'descendant::a[@class="title" and @href]'): + title = tostring(a, method='text', encoding=unicode) + if title_ok(title): + matches.append(a.get('href')) + break + + if not matches: + # This can happen for some user agents that Amazon thinks are + # mobile/less capable + for td in root.xpath( + r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'): + for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'): + title = tostring(a, method='text', encoding=unicode) + if title_ok(title): + matches.append(a.get('href')) + break + + + # Keep only the top 5 matches as the matches are sorted by relevance by + # Amazon so lower matches are not likely to be very relevant + return matches[:5] + # }}} + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. 
''' - from lxml.html import tostring from calibre.utils.cleantext import clean_ascii_chars - from calibre.utils.soupparser import fromstring from calibre.ebooks.chardet import xml_to_unicode + from lxml.html import tostring + import html5lib query, domain = self.create_query(log, title=title, authors=authors, identifiers=identifiers) @@ -616,6 +691,8 @@ class Amazon(Source): log.error('Insufficient metadata to construct query') return br = self.browser + if getattr(self, 'running_a_test', False): + print ('Using user agent for amazon: %s'%self.user_agent) try: raw = br.open_novisit(query, timeout=timeout).read().strip() except Exception as e: @@ -634,15 +711,23 @@ class Amazon(Source): return as_unicode(msg) - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] + raw = clean_ascii_chars(xml_to_unicode(raw, + strip_encoding_pats=True, resolve_entities=True)[0]) + + if getattr(self, 'running_a_test', False): + import tempfile + with tempfile.NamedTemporaryFile(prefix='amazon_results_', + suffix='.html', delete=False) as f: + f.write(raw.encode('utf-8')) + print ('Downloaded html for results page saved in', f.name) matches = [] found = '404 - ' not in raw if found: try: - root = fromstring(clean_ascii_chars(raw)) + root = html5lib.parse(raw, treebuilder='lxml', + namespaceHTMLElements=False) except: msg = 'Failed to parse amazon page for query: %r'%query log.exception(msg) @@ -655,30 +740,9 @@ class Amazon(Source): # The error is almost always a not found error found = False + if found: - for div in root.xpath(r'//div[starts-with(@id, "result_")]'): - for a in div.xpath(r'descendant::a[@class="title" and @href]'): - title = tostring(a, method='text', encoding=unicode).lower() - if 'bulk pack' not in title: - matches.append(a.get('href')) - break - if not matches: - # This can happen for some user agents that Amazon thinks are - # mobile/less capable - log('Trying alternate results page markup') - for td in root.xpath( - r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'): - for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'): - title = tostring(a, method='text', encoding=unicode).lower() - if ('bulk pack' not in title and '[audiobook]' not in - title and '[audio cd]' not in title): - matches.append(a.get('href')) - break - - - # Keep only the top 5 matches as the matches are sorted by relevance by - # Amazon so lower matches are not likely to be very relevant - matches = matches[:5] + matches = self.parse_results_page(root) if abort.is_set(): return @@ -686,7 +750,7 @@ class Amazon(Source): if not matches: if identifiers and title and authors: log('No matches found with identifiers, retrying using only' - ' title and authors') + ' title and authors. 
Query: %r'%query) return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) log.error('No matches found with query: %r'%query) @@ -756,9 +820,18 @@ if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e # src/calibre/ebooks/metadata/sources/amazon.py from calibre.ebooks.metadata.sources.test import (test_identify_plugin, - isbn_test, title_test, authors_test) + isbn_test, title_test, authors_test, comments_test) com_tests = [ # {{{ + ( # Different comments markup, using Book Description section + {'identifiers':{'amazon':'0982514506'}}, + [title_test( + "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy" + , exact=True), + comments_test('Jelena'), comments_test('Leslie'), + ] + ), + ( # # in title {'title':'Expert C# 2008 Business Objects', 'authors':['Lhotka']}, @@ -850,7 +923,17 @@ if __name__ == '__main__': # tests {{{ ), ] # }}} - test_identify_plugin(Amazon.name, com_tests) - #test_identify_plugin(Amazon.name, de_tests) + def do_test(domain, start=0, stop=None): + tests = globals().get(domain+'_tests') + if stop is None: + stop = len(tests) + tests = tests[start:stop] + test_identify_plugin(Amazon.name, tests, modify_plugin=lambda + p:setattr(p, 'testing_domain', domain)) + + do_test('com') + + #do_test('de') + # }}} diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 4c334f4e46..4408bff6c6 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -253,10 +253,16 @@ class Source(Plugin): # Browser {{{ + @property + def user_agent(self): + # Pass in an index to random_user_agent() to test with a particular + # user agent + return random_user_agent() + @property def browser(self): if self._browser is None: - self._browser = browser(user_agent=random_user_agent()) + self._browser = browser(user_agent=self.user_agent) if self.supports_gzip_transfer_encoding: self._browser.set_handle_gzip(True) return self._browser.clone_browser() diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py index bccce3dba2..4853035b27 100644 --- a/src/calibre/ebooks/metadata/sources/test.py +++ b/src/calibre/ebooks/metadata/sources/test.py @@ -84,6 +84,16 @@ def series_test(series, series_index): return test +def comments_test(sentinel): + + def test(mi): + comm = mi.comments.lower() if mi.comments else '' + if sentinel and sentinel.lower() in comm: + return True + prints('comments test failed. %s not in comments'%sentinel) + return False + return test + def init_test(tdir_name): tdir = tempfile.gettempdir() lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt') @@ -157,7 +167,7 @@ def test_identify(tests): # {{{ # }}} -def test_identify_plugin(name, tests): # {{{ +def test_identify_plugin(name, tests, modify_plugin=lambda plugin:None): # {{{ ''' :param name: Plugin name :param tests: List of 2-tuples. 
Each 2-tuple is of the form (args,
@@ -171,6 +181,7 @@
         if x.name == name:
             plugin = x
             break
+    modify_plugin(plugin)
     prints('Testing the identify function of', plugin.name)
     prints('Using extra headers:', plugin.browser.addheaders)

diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 5d6c43f343..0f45a47032 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -136,6 +136,7 @@ def sanitize_comments_html(html):
     text = html2text(html)
     md = markdown.Markdown(safe_mode=True)
     cleansed = re.sub('\n+', '', md.convert(text))
+    cleansed = cleansed.replace(markdown.HTML_REMOVED_TEXT, '')
     return cleansed
 
 def test():

From d0ab0d6fcedb3d5d8c446cf7802e37bcee8fc48c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 16 Feb 2012 14:48:57 +0530
Subject: [PATCH 29/29] Add a function to return the current library name

---
 src/calibre/library/__init__.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/calibre/library/__init__.py b/src/calibre/library/__init__.py
index 84a7acbc73..605d062de3 100644
--- a/src/calibre/library/__init__.py
+++ b/src/calibre/library/__init__.py
@@ -61,4 +61,13 @@ def generate_test_db(library_path, # {{{
     print 'Time per record:', t/float(num_of_records)
 # }}}
 
+def current_library_name():
+    from calibre.utils.config import prefs
+    import posixpath
+    path = prefs['library_path']
+    if path:
+        path = path.replace('\\', '/')
+        while path.endswith('/'):
+            path = path[:-1]
+        return posixpath.basename(path)
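
A short usage sketch for the new helper. The example paths are
hypothetical, and since the return statement sits inside the if block, an
unset library_path presumably yields None:

    # Sketch only; the paths are hypothetical examples.
    from calibre.library import current_library_name

    # With prefs['library_path'] set to 'C:\\Users\\xyz\\Calibre Library'
    # or to '/home/xyz/Calibre Library/', both normalise to the same name:
    print current_library_name()  # 'Calibre Library'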
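
The test hooks introduced in PATCH 28 above are meant to be used together;
this sketch uses only names from those diffs, with choose=0 and the 'de'
domain as arbitrary example values:

    # Sketch only; choose=0 and 'de' are arbitrary example values.
    from calibre import random_user_agent

    print random_user_agent(choose=0)  # deterministic: first entry in choices
    print random_user_agent()          # default: random choice, as before

    # In the amazon.py test runner, modify_plugin pins the domain the same
    # way do_test() does above:
    #   test_identify_plugin(Amazon.name, de_tests,
    #       modify_plugin=lambda p: setattr(p, 'testing_domain', 'de'))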