merge from trunk

ldolse 2010-09-29 07:18:30 +08:00
commit 81027bcff9
11 changed files with 178 additions and 93 deletions

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
 __author__ = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__ = '1.03'
 __date__ = '27, September 2010'
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'
@@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
     masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
     extra_css = '''
         h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
         h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
         p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
         p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
         p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
         '''
     keep_only_tags = [
         dict(name='div', attrs={'class':['cxArticleHeader']})
        ,dict(attrs={'id':['cxArticleText']})
         ]
     remove_tags = [
         dict(name='div' , attrs={'class':'cxArticleList' })
        ,dict(name='div' , attrs={'class':'cxFeedTease' })
        ,dict(name='div' , attrs={'class':'cxElementEnlarge' })
        ,dict(name='div' , attrs={'id':'cxArticleTools' })
         ]
     feeds = [
         ('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
         # -------------------------------------------------------------------
         # Here are the different area feeds. Choose which ever one you wish to
         # read by simply removing the pound sign from it. I currently have it
         # set to only get the Cobb area
         # --------------------------------------------------------------------
@@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
         ('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'),
         ('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'),
         # ------------------------------------------------------------------------
         # Here are the different sports feeds. I only follow the Falcons, and Highschool
         # but again
         # You can enable which ever team you like by removing the pound sign
         # ------------------------------------------------------------------------
@@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
         ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
         ]
     def postprocess_html(self, soup, first):
         for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
             credit_tag.extract()
         return soup
     #def print_version(self, url):
     #    return url.partition('?')[0] +'?printArticle=y'

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
 __author__ = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__ = '1.04'
 __date__ = '27, September 2010'
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'
@@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     category = 'news, politics, USA, talkshow'
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
@@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     conversion_options = {'linearize_tables' : True}
     feeds = [
         ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
         ]

View File

@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'
@@ -13,35 +13,35 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
     masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
     feeds = [
         ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
         ('Cars', 'http://www.popsci.com/full-feed/cars'),
         ('Science', 'http://www.popsci.com/full-feed/science'),
         ('Technology', 'http://www.popsci.com/full-feed/technology'),
         ('DIY', 'http://www.popsci.com/full-feed/diy'),
         ]
     #The following will get read of the Gallery: links when found
     def preprocess_html(self, soup) :
         print 'SOUP IS: ', soup
         weblinks = soup.findAll(['head','h2'])
         if weblinks is not None:
             for link in weblinks:
                 if re.search('(Gallery)(:)',str(link)):
                     link.parent.extract()
         return soup
     #-----------------------------------------------------------------

View File

@@ -1,6 +1,5 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 telegraph.co.uk
 '''
@@ -8,14 +7,16 @@ telegraph.co.uk
 from calibre.web.feeds.news import BasicNewsRecipe
 class TelegraphUK(BasicNewsRecipe):
-    title = u'Telegraph.co.uk'
+    title = 'Telegraph.co.uk'
     __author__ = 'Darko Miletic and Sujata Raman'
     description = 'News from United Kingdom'
-    oldest_article = 7
+    oldest_article = 2
+    category = 'news, politics, UK'
+    publisher = 'Telegraph Media Group ltd.'
     max_articles_per_feed = 100
     no_stylesheets = True
-    language = 'en'
+    language = 'en_GB'
+    remove_empty_feeds = True
     use_embedded_content = False
     extra_css = '''
@@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
         .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
         '''
+    conversion_options = {
+          'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        }
     keep_only_tags = [
-         dict(name='div', attrs={'class':'storyHead'})
-        ,dict(name='div', attrs={'class':'story' })
-        #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
+         dict(name='div', attrs={'class':['storyHead','byline']})
+        ,dict(name='div', attrs={'id':'mainBodyArea' })
         ]
-    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
-        #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
+    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
+        ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
        ,dict(name='span', attrs={'class':['num','placeComment']})
         ]
@@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
         ]
     def get_article_url(self, article):
-        url = article.get('guid', None)
+        url = article.get('link', None)
         if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
             url = None
         return url
-    def postprocess_html(self,soup,first):
-        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
-            for pTag in bylineTag.findAll(name='p'):
-                if getattr(pTag.contents[0],"Comments",True):
-                    pTag.extract()
-        return soup

View File

@@ -469,14 +469,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-    LibraryThingCovers
+    LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers]
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
 plugins += [
     ComicInput,
     EPUBInput,

View File

@@ -251,7 +251,7 @@ class OutputProfile(Plugin):
     #: The character used to represent a star in ratings
     ratings_char = u'*'
     #: Unsupported unicode characters to be replaced during preprocessing
     unsupported_unicode_chars = []

View File

@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 default_disabled_plugins = set([
-        'Douban Books',
+        'Douban Books', 'Douban.com covers',
         ])
 def is_disabled(plugin):

View File

@@ -61,7 +61,7 @@ def wrap_lines(match):
         return ' '
     else:
         return ital+' '
 class DocAnalysis(object):
     '''
     Provides various text analysis functions to determine how the document is structured.
@@ -79,7 +79,7 @@ class DocAnalysis(object):
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
         self.lines = linere.findall(raw)
     def line_length(self, percent):
         '''
         Analyses the document to find the median line length.
@@ -114,7 +114,7 @@ class DocAnalysis(object):
         index = int(len(lengths) * percent) - 1
         return lengths[index]
     def line_histogram(self, percent):
         '''
         Creates a broad histogram of the document to determine whether it incorporates hard
@@ -147,14 +147,12 @@ class DocAnalysis(object):
         h = [ float(count)/totalLines for count in hRaw ]
         #print "\nhRaw histogram lengths are: "+str(hRaw)
         #print " percents are: "+str(h)+"\n"
         # Find the biggest bucket
         maxValue = 0
-        peakPosition = 0
         for i in range(0,len(h)):
             if h[i] > maxValue:
                 maxValue = h[i]
-                peakPosition = i
         if maxValue < percent:
             #print "Line lengths are too variable. Not unwrapping."
@@ -195,7 +193,7 @@ class Dehyphenator(object):
         try:
             searchresult = self.html.find(str.lower(lookupword))
         except:
             return hyphenated
         if self.format == 'html_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
@@ -206,7 +204,7 @@ class Dehyphenator(object):
             else:
                 #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                 return firsthalf+u'\u2014'+wraptags+secondhalf
         else:
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 #print "returned dehyphenated word: " + str(dehyphenated)
@@ -533,12 +531,12 @@ class HTMLPreProcessor(object):
             html = self.smarten_punctuation(html)
         unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
-        if unsupported_unicode_chars != []:
+        if unsupported_unicode_chars:
             from calibre.ebooks.unidecode.unidecoder import Unidecoder
             unidecoder = Unidecoder()
             for char in unsupported_unicode_chars:
                 asciichar = unidecoder.decode(char)
-                html = re.sub(u'%s' % char, asciichar, html)
+                html = html.replace(char, asciichar)
         return html

View File

@@ -81,7 +81,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -129,6 +129,7 @@ class PreProcessor(object):
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
@@ -140,7 +141,7 @@ class PreProcessor(object):
                 #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
         #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
@@ -159,14 +160,14 @@ class PreProcessor(object):
         title_header_close = ")\s*"
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
         default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
         #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
@@ -202,7 +203,7 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
         # Check Line histogram to determine if the document uses hard line breaks, If 50% or
         # more of the lines break in the same region of the document then unwrapping is required
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
@@ -233,7 +234,7 @@ class PreProcessor(object):
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
             self.log("Done dehyphenating")
         # delete soft hyphens
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

View File

@@ -9,6 +9,7 @@ import traceback, socket, re, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
+from lxml import etree
 import mechanize
@@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
 # }}}
+class DoubanCovers(CoverDownload): # {{{
+    'Download covers from Douban.com'
+    DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+    CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    name = 'Douban.com covers'
+    description = _('Download covers from Douban.com')
+    author = 'Li Fanxi'
+
+    def get_cover_url(self, isbn, br, timeout=5.):
+        try:
+            url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
+            src = br.open(url, timeout=timeout).read()
+        except Exception, err:
+            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
+                err = Exception(_('Douban.com API timed out. Try again later.'))
+            raise err
+        else:
+            feed = etree.fromstring(src)
+            NAMESPACES = {
+                'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+                'atom' : 'http://www.w3.org/2005/Atom',
+                'db': 'http://www.douban.com/xmlns/'
+            }
+            XPath = partial(etree.XPath, namespaces=NAMESPACES)
+            entries = XPath('//atom:entry')(feed)
+            if len(entries) < 1:
+                return None
+            try:
+                cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+                u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
+                # If URL contains "book-default", the book doesn't have a cover
+                if u.find('book-default') != -1:
+                    return None
+            except:
+                return None
+            return u
+
+    def has_cover(self, mi, ans, timeout=5.):
+        if not mi.isbn:
+            return False
+        br = browser()
+        try:
+            if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, e:
+            self.debug(e)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            cover_data = br.open_novisit(url).read()
+            result_queue.put((True, cover_data, 'jpg', self.name))
+        except Exception, e:
+            result_queue.put((False, self.exception_to_string(e),
+                traceback.format_exc(), self.name))
+# }}}
 def download_cover(mi, timeout=5.): # {{{
     results = Queue()
     download_covers(mi, results, max_covers=1, timeout=timeout)

View File

@@ -584,12 +584,42 @@ class LibraryPage(QWizardPage, LibraryUI):
             qt_app.load_translations()
             self.emit(SIGNAL('retranslate()'))
             self.init_languages()
+            try:
+                if prefs['language'].lower().startswith('zh'):
+                    from calibre.customize.ui import enable_plugin
+                    for name in ('Douban Books', 'Douban.com covers'):
+                        enable_plugin(name)
+            except:
+                pass
+
+    def is_library_dir_suitable(self, x):
+        return LibraryDatabase2.exists_at(x) or not os.listdir(x)
+
+    def validatePage(self):
+        newloc = unicode(self.location.text())
+        if not self.is_library_dir_suitable(newloc):
+            self.show_library_dir_error(newloc)
+            return False
+        return True
+
     def change(self):
-        dir = choose_dir(self, 'database location dialog',
+        x = choose_dir(self, 'database location dialog',
                 _('Select location for books'))
-        if dir:
-            self.location.setText(dir)
+        if x:
+            if self.is_library_dir_suitable(x):
+                self.location.setText(x)
+            else:
+                self.show_library_dir_error(x)
+
+    def show_library_dir_error(self, x):
+        if not isinstance(x, unicode):
+            try:
+                x = x.decode(filesystem_encoding)
+            except:
+                x = unicode(repr(x))
+        error_dialog(self, _('Bad location'),
+            _('You must choose an empty folder for '
+                'the calibre library. %s is not empty.')%x, show=True)
     def initializePage(self):
         lp = prefs['library_path']