diff --git a/resources/recipes/ajc.recipe b/resources/recipes/ajc.recipe
index ccd0efebdd..ea989b4b4c 100644
--- a/resources/recipes/ajc.recipe
+++ b/resources/recipes/ajc.recipe
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__author__ = 'Tony Stegall'  
+__author__ = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__ = '1.03'
 __date__ = '27, September 2010'
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 
 import datetime
 
+from calibre.web.feeds.news import BasicNewsRecipe
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'
@@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
-    
+
     masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
     extra_css = '''
                 h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                 h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-                
+
                 p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                 p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
-                
-                
+
+
                 p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                 '''
-    
-    
+
+
     keep_only_tags = [
                      dict(name='div', attrs={'class':['cxArticleHeader']})
                     ,dict(attrs={'id':['cxArticleText']})
                      ]
-    
-    
+
+
     remove_tags = [
                   dict(name='div' , attrs={'class':'cxArticleList' })
                  ,dict(name='div' , attrs={'class':'cxFeedTease' })
                  ,dict(name='div' , attrs={'class':'cxElementEnlarge' })
                  ,dict(name='div' , attrs={'id':'cxArticleTools' })
                   ]
-    
-    
-    
+
+
+
     feeds = [
             ('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
             # -------------------------------------------------------------------
-            # Here are the different area feeds. Choose which ever one you wish to 
+            # Here are the different area feeds. Choose which ever one you wish to
             # read by simply removing the pound sign from it. I currently have it
             # set to only get the Cobb area
             # --------------------------------------------------------------------
@@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
             ('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'),
             ('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'),
             # ------------------------------------------------------------------------
-            # Here are the different sports feeds. I only follow the Falcons, and Highschool 
+            # Here are the different sports feeds. I only follow the Falcons, and Highschool
             # but again
             # You can enable which ever team you like by removing the pound sign
             # ------------------------------------------------------------------------
@@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
             ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
             ]
 
-    
+
     def postprocess_html(self, soup, first):
         for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
             credit_tag.extract()
-    
+
         return soup
-    
+
     #def print_version(self, url):
        # return url.partition('?')[0] +'?printArticle=y'
-    
-    
-    
-    
-    
-    
+
+
+
+
+
+
diff --git a/resources/recipes/boortz.recipe b/resources/recipes/boortz.recipe
index dfb624c4bc..b281798ac8 100644
--- a/resources/recipes/boortz.recipe
+++ b/resources/recipes/boortz.recipe
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__author__ = 'Tony Stegall'  
+__author__ = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__ = '1.04'
 __date__ = '27, September 2010'
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'
@@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     category = 'news, politics, USA, talkshow'
     oldest_article = 1
     max_articles_per_feed = 100
-    
+
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
@@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     conversion_options = {'linearize_tables' : True}
     feeds = [
             ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
-            
+
             ]
diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe
index 2bef7e4807..5f66d048a6 100644
--- a/resources/recipes/popscience.recipe
+++ b/resources/recipes/popscience.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re
 
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'
@@ -13,35 +13,35 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
-    
+
     masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
-    
-    
+
+
     feeds = [
-            
+
             ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
             ('Cars', 'http://www.popsci.com/full-feed/cars'),
             ('Science', 'http://www.popsci.com/full-feed/science'),
             ('Technology', 'http://www.popsci.com/full-feed/technology'),
             ('DIY', 'http://www.popsci.com/full-feed/diy'),
-            
+
             ]
-    
-    #The following will get read of the Gallery: links when found 
-    
+
+    #The following will get read of the Gallery: links when found
+
     def preprocess_html(self, soup) :
         print 'SOUP IS: ', soup
         weblinks = soup.findAll(['head','h2'])
         if weblinks is not None:
             for link in weblinks:
                 if re.search('(Gallery)(:)',str(link)):
-                    
+
                     link.parent.extract()
         return soup
-    #----------------------------------------------------------------- 
-    
-    
+    #-----------------------------------------------------------------
+
+
diff --git a/resources/recipes/telegraph_uk.recipe b/resources/recipes/telegraph_uk.recipe
index 2c261987b2..f79f0fa50c 100644
--- a/resources/recipes/telegraph_uk.recipe
+++ b/resources/recipes/telegraph_uk.recipe
@@ -1,6 +1,5 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 telegraph.co.uk
 '''
@@ -8,14 +7,16 @@ telegraph.co.uk
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class TelegraphUK(BasicNewsRecipe):
-    title = u'Telegraph.co.uk'
+    title = 'Telegraph.co.uk'
     __author__ = 'Darko Miletic and Sujata Raman'
     description = 'News from United Kingdom'
-    oldest_article = 7
+    oldest_article = 2
+    category = 'news, politics, UK'
+    publisher = 'Telegraph Media Group ltd.'
     max_articles_per_feed = 100
     no_stylesheets = True
-    language = 'en'
-
+    language = 'en_GB'
+    remove_empty_feeds = True
     use_embedded_content = False
 
     extra_css = '''
@@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
                 .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
                 '''
 
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
     keep_only_tags = [
-                      dict(name='div', attrs={'class':'storyHead'})
-                     ,dict(name='div', attrs={'class':'story' })
-                     #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
+                      dict(name='div', attrs={'class':['storyHead','byline']})
+                     ,dict(name='div', attrs={'id':'mainBodyArea' })
                      ]
-    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
-                   #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
+    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
+                  ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
                   ,dict(name='span', attrs={'class':['num','placeComment']})
                   ]
@@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
              ]
 
     def get_article_url(self, article):
-        
-        url = article.get('guid', None)
-        
+        url = article.get('link', None)
         if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
             url = None
-
         return url
-        
-        
-    def postprocess_html(self,soup,first):
-        
-        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
-            for pTag in bylineTag.findAll(name='p'):
-                if getattr(pTag.contents[0],"Comments",True):
-                    pTag.extract()
-        return soup
-        
-        
-        
-        
-        
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index ec9f7e2bc2..5fd51de38c 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -469,14 +469,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-        LibraryThingCovers
+        LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers]
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
 
 plugins += [
     ComicInput,
     EPUBInput,
diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index 5fb14988a5..0310f09242 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -251,7 +251,7 @@ class OutputProfile(Plugin):
 
     #: The character used to represent a star in ratings
     ratings_char = u'*'
-    
+
     #: Unsupported unicode characters to be replaced during preprocessing
     unsupported_unicode_chars = []
 
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index 231cc0e225..844269e453 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 
 default_disabled_plugins = set([
-        'Douban Books',
+        'Douban Books', 'Douban.com covers',
         ])
 
 def is_disabled(plugin):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 2e02a1b90e..bb5c26a50c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -61,7 +61,7 @@ def wrap_lines(match):
         return ' '
     else:
         return ital+' '
-    
+
 class DocAnalysis(object):
     '''
     Provides various text analysis functions to determine how the document is structured.
@@ -79,7 +79,7 @@ class DocAnalysis(object):
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
         self.lines = linere.findall(raw)
-        
+
     def line_length(self, percent):
         '''
         Analyses the document to find the median line length.
@@ -114,7 +114,7 @@ class DocAnalysis(object):
             index = int(len(lengths) * percent) - 1
 
         return lengths[index]
-        
+
     def line_histogram(self, percent):
         '''
         Creates a broad histogram of the document to determine whether it incorporates hard
@@ -147,14 +147,12 @@ class DocAnalysis(object):
         h = [ float(count)/totalLines for count in hRaw ]
         #print "\nhRaw histogram lengths are: "+str(hRaw)
         #print " percents are: "+str(h)+"\n"
-        
+
         # Find the biggest bucket
         maxValue = 0
-        peakPosition = 0
         for i in range(0,len(h)):
             if h[i] > maxValue:
                 maxValue = h[i]
-                peakPosition = i
 
         if maxValue < percent:
             #print "Line lengths are too variable. Not unwrapping."
@@ -195,7 +193,7 @@ class Dehyphenator(object):
         try:
             searchresult = self.html.find(str.lower(lookupword))
         except:
-            return hyphenated              
+            return hyphenated
         if self.format == 'html_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
@@ -206,7 +204,7 @@ class Dehyphenator(object):
             else:
                 #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                 return firsthalf+u'\u2014'+wraptags+secondhalf
-            
+
         else:
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "returned dehyphenated word: " + str(dehyphenated)
@@ -533,12 +531,12 @@ class HTMLPreProcessor(object):
             html = self.smarten_punctuation(html)
 
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
-        if unsupported_unicode_chars != []:
+        if unsupported_unicode_chars:
             from calibre.ebooks.unidecode.unidecoder import Unidecoder
             unidecoder = Unidecoder()
             for char in unsupported_unicode_chars:
                 asciichar = unidecoder.decode(char)
-                html = re.sub(u'%s' % char, asciichar, html)
+                html = html.replace(char, asciichar)
 
         return html
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index b6969a3659..5f5c12a703 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -81,7 +81,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
-        
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -129,6 +129,7 @@ class PreProcessor(object):
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
@@ -140,7 +141,7 @@ class PreProcessor(object):
                 #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
@@ -159,14 +160,14 @@ class PreProcessor(object):
         title_header_close = ")\s*"
         title_line_close = "(</(font|span|[ibu])>)?\s*(</(font|span|[ibu])>)?\s*(</(p|div)[^>]*>)?\s*"
         opt_title_close = ")?"
-        
+
        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
-        
+
        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
+        #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
@@ -202,7 +203,7 @@ class PreProcessor(object):
                 format = 'html'
         else:
             format = 'html'
-        # Check Line histogram to determine if the document uses hard line breaks, If 50% or 
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
         # more of the lines break in the same region of the document then unwrapping is required
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
@@ -233,7 +234,7 @@ class PreProcessor(object):
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
             self.log("Done dehyphenating")
-        
+
         # delete soft hyphens
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index b05444c1c6..2f6fb46540 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -9,6 +9,7 @@ import traceback, socket, re, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
+from lxml import etree
 
 import mechanize
 
@@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
 
 # }}}
 
+class DoubanCovers(CoverDownload): # {{{
+    'Download covers from Douban.com'
+
+    DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+    CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    name = 'Douban.com covers'
+    description = _('Download covers from Douban.com')
+    author = 'Li Fanxi'
+
+    def get_cover_url(self, isbn, br, timeout=5.):
+        try:
+            url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
+            src = br.open(url, timeout=timeout).read()
+        except Exception, err:
+            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
+                err = Exception(_('Douban.com API timed out. Try again later.'))
+            raise err
+        else:
+            feed = etree.fromstring(src)
+            NAMESPACES = {
+              'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+              'atom' : 'http://www.w3.org/2005/Atom',
+              'db': 'http://www.douban.com/xmlns/'
+            }
+            XPath = partial(etree.XPath, namespaces=NAMESPACES)
+            entries = XPath('//atom:entry')(feed)
+            if len(entries) < 1:
+                return None
+            try:
+                cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+                u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
+                # If URL contains "book-default", the book doesn't have a cover
+                if u.find('book-default') != -1:
+                    return None
+            except:
+                return None
+            return u
+
+    def has_cover(self, mi, ans, timeout=5.):
+        if not mi.isbn:
+            return False
+        br = browser()
+        try:
+            if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, e:
+            self.debug(e)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            cover_data = br.open_novisit(url).read()
+            result_queue.put((True, cover_data, 'jpg', self.name))
+        except Exception, e:
+            result_queue.put((False, self.exception_to_string(e),
+                traceback.format_exc(), self.name))
+# }}}
+
 def download_cover(mi, timeout=5.): # {{{
     results = Queue()
     download_covers(mi, results, max_covers=1, timeout=timeout)
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index f3234d48d5..ef58ec3a90 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -584,12 +584,42 @@ class LibraryPage(QWizardPage, LibraryUI):
             qt_app.load_translations()
             self.emit(SIGNAL('retranslate()'))
             self.init_languages()
+            try:
+                if prefs['language'].lower().startswith('zh'):
+                    from calibre.customize.ui import enable_plugin
+                    for name in ('Douban Books', 'Douban.com covers'):
+                        enable_plugin(name)
+            except:
+                pass
+
+    def is_library_dir_suitable(self, x):
+        return LibraryDatabase2.exists_at(x) or not os.listdir(x)
+
+    def validatePage(self):
+        newloc = unicode(self.location.text())
+        if not self.is_library_dir_suitable(newloc):
+            self.show_library_dir_error(newloc)
+            return False
+        return True
 
     def change(self):
-        dir = choose_dir(self, 'database location dialog',
+        x = choose_dir(self, 'database location dialog',
                 _('Select location for books'))
-        if dir:
-            self.location.setText(dir)
+        if x:
+            if self.is_library_dir_suitable(x):
+                self.location.setText(x)
+            else:
+                self.show_library_dir_error(x)
+
+    def show_library_dir_error(self, x):
+        if not isinstance(x, unicode):
+            try:
+                x = x.decode(filesystem_encoding)
+            except:
+                x = unicode(repr(x))
+        error_dialog(self, _('Bad location'),
+                _('You must choose an empty folder for '
+                    'the calibre library. %s is not empty.')%x, show=True)
 
     def initializePage(self):
         lp = prefs['library_path']