KG updates pre 0.7.21
commit aef9c916ed
@@ -1,8 +1,20 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1.03'
__date__ = '27, September 2010'
__docformat__ = 'restructuredtext en'


import datetime
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    now = datetime.datetime.now()
    title = 'The AJC'
    language = 'en'
    timefmt = ' [%a,%d %B %Y %I:%M %p]'
    __author__ = 'TonytheBookworm'
    description = 'News from Atlanta and USA'
    publisher = 'The Atlanta Journal'
@@ -13,10 +25,14 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):

    masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                    h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}

                    p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                    p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}

                    p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                '''

@@ -71,9 +87,11 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
              ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
             ]

    def postprocess_html(self, soup, first):
        for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
            credit_tag.name ='p'
            credit_tag.extract()

        return soup
@@ -1,5 +1,14 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1.04'
__date__ = '27, September 2010'
__docformat__ = 'restructuredtext en'


from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re

class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    title = 'Nealz Nuze'
    language = 'en'
@@ -7,38 +16,15 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    description = 'Neal Boortz Show Radio Notes'
    publisher = 'Neal Boortz'
    category = 'news, politics, USA, talkshow'
    oldest_article = 2
    oldest_article = 1
    max_articles_per_feed = 100
    linearize_tables = True

    no_stylesheets = True
    remove_javascript = True

    use_embedded_content = True
    masthead_url = 'http://boortz.com/images/nuze_logo.gif'
    keep_only_tags = [
                      dict(name='td', attrs={'id':['contentWellCell']})
                     ]
    remove_tags = [
                   dict(name='a', attrs={'class':['blogPermalink']}),
                   dict(name='span', attrs={'class':['blogBylineSeparator']}),
                   dict(name='td', attrs={'id':['nealztitle']}),
                  ]
    remove_tags_after = [dict(name='div', attrs={'class':'blogEntryBody'}),]
    conversion_options = {'linearize_tables' : True}
    feeds = [
             ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
            ]
resources/recipes/dhnet_be.recipe  (new file, 34 lines)
@@ -0,0 +1,34 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Lionel Bergeret <lbergeret at gmail.com>'
'''
dhnet.be
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DHNetBe(BasicNewsRecipe):
    title = u'La Derniere Heure'
    __author__ = u'Lionel Bergeret'
    description = u'News from Belgium in French'
    publisher = u'dhnet.be'
    category = 'news, Belgium'
    oldest_article = 3
    language = 'fr'

    max_articles_per_feed = 20
    no_stylesheets = True
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'

    keep_only_tags = [
         dict(name = 'div', attrs = {'id': 'articleText'})
        ,dict(name = 'div', attrs = {'id': 'articlePicureAndLinks'})
        ]

    feeds = [
         (u'La Une'        , u'http://www.dhnet.be/rss')
        ,(u'La Une Sports' , u'http://www.dhnet.be/rss/dhsports/')
        ,(u'La Une Info'   , u'http://www.dhnet.be/rss/dhinfos/')
        ]
resources/recipes/lesoir_be.recipe  (new file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Lionel Bergeret <lbergeret at gmail.com>'
'''
lesoir.be
'''

from calibre.web.feeds.news import BasicNewsRecipe

class LeSoirBe(BasicNewsRecipe):
    title = u'Le Soir'
    __author__ = u'Lionel Bergeret'
    description = u'News from Belgium in French'
    publisher = u'lesoir.be'
    category = 'news, Belgium'
    oldest_article = 3
    language = 'fr'

    max_articles_per_feed = 20
    no_stylesheets = True
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'

    keep_only_tags = [
         dict(name = 'div', attrs = {'id': 'story_head'})
        ,dict(name = 'div', attrs = {'id': 'story_body'})
        ]

    remove_tags = [
         dict(name='form', attrs={'id':'story_actions'})
        ,dict(name='div', attrs={'id':'sb-share'})
        ,dict(name='div', attrs={'id':'sb-subscribe'})
        ]

    feeds = [
         (u'Belgique'       , u'http://www.lesoir.be/actualite/belgique/rss.xml')
        ,(u'France'         , u'http://www.lesoir.be/actualite/france/rss.xml')
        ,(u'Monde'          , u'http://www.lesoir.be/actualite/monde/rss.xml')
        ,(u'Regions'        , u'http://www.lesoir.be/regions/rss.xml')
        ,(u'Vie du Net'     , u'http://www.lesoir.be/actualite/vie_du_net/rss.xml')
        ,(u'Petite Gazette' , u'http://www.lesoir.be/actualite/sciences/rss.xml')
        ]
@@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
import re

class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    title = 'Popular Science'
@@ -13,35 +13,35 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = True

    masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'

    feeds = [
             ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
             ('Cars', 'http://www.popsci.com/full-feed/cars'),
             ('Science', 'http://www.popsci.com/full-feed/science'),
             ('Technology', 'http://www.popsci.com/full-feed/technology'),
             ('DIY', 'http://www.popsci.com/full-feed/diy'),
            ]

    # The following will get rid of the Gallery: links when found
    # The following will get rid of the Gallery: links when found
    def preprocess_html(self, soup) :
        print 'SOUP IS: ', soup
        weblinks = soup.findAll(['head','h2'])
        if weblinks is not None:
            for link in weblinks:
                if re.search('(Gallery)(:)',str(link)):
                    link.parent.extract()
        return soup
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
@@ -1,6 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
telegraph.co.uk
'''
@@ -8,14 +7,16 @@ telegraph.co.uk
from calibre.web.feeds.news import BasicNewsRecipe

class TelegraphUK(BasicNewsRecipe):
    title = u'Telegraph.co.uk'
    title = 'Telegraph.co.uk'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'News from United Kingdom'
    oldest_article = 7
    oldest_article = 2
    category = 'news, politics, UK'
    publisher = 'Telegraph Media Group ltd.'
    max_articles_per_feed = 100
    no_stylesheets = True
    language = 'en'

    language = 'en_GB'
    remove_empty_feeds = True
    use_embedded_content = False

    extra_css = '''
@@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe):
                .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
                '''

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                         }

    keep_only_tags = [
                       dict(name='div', attrs={'class':'storyHead'})
                      ,dict(name='div', attrs={'class':'story' })
                      #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
                       dict(name='div', attrs={'class':['storyHead','byline']})
                      ,dict(name='div', attrs={'id':'mainBodyArea' })
                     ]
    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
                   #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
                   ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
                   ,dict(name='span', attrs={'class':['num','placeComment']})
                  ]

@@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe):
              ]

    def get_article_url(self, article):

        url = article.get('guid', None)

        url = article.get('link', None)
        if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
            url = None

        return url

    def postprocess_html(self,soup,first):

        for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
            for pTag in bylineTag.findAll(name='p'):
                if getattr(pTag.contents[0],"Comments",True):
                    pTag.extract()
        return soup
resources/recipes/twtfb.recipe  (new file, 40 lines)
@@ -0,0 +1,40 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thewaythefutureblogs.com
Frederik Pohl's Blog
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheWayTheFutureBlogs(BasicNewsRecipe):
    title = 'The Way the Future Blogs'
    __author__ = 'Darko Miletic'
    description = "Frederik Pohl's blog"
    publisher = 'Frederik Pohl'
    category = 'news, SF, books'
    oldest_article = 30
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    extra_css = ' body{font-family: Georgia,serif } '

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                         }
    remove_tags = [dict(name=['meta','object','embed','iframe','base','link'])]
    keep_only_tags = [dict(attrs={'class':['post','commentlist']})]
    remove_attributes = ['width','height','lang','border']

    feeds = [(u'Posts', u'http://www.thewaythefutureblogs.com/feed/')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@@ -469,14 +469,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
                                          LibraryThing
from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
                                           LibraryThingCovers
                                           LibraryThingCovers, DoubanCovers
from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck

plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
        LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
        Epubcheck, OpenLibraryCovers, LibraryThingCovers]
        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
plugins += [
    ComicInput,
    EPUBInput,
@@ -1,3 +1,4 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
    #: The character used to represent a star in ratings
    ratings_char = u'*'

    #: Unsupported unicode characters to be replaced during preprocessing
    unsupported_unicode_chars = []

    @classmethod
    def tags_to_string(cls, tags):
        return escape(', '.join(tags))
@@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
    dpi = 168.451
    fbase = 12
    fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
    unsupported_unicode_chars = [u'\u201f', u'\u201b']


class KoboReaderOutput(OutputProfile):
@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
    config['enabled_plugins'] = ep

default_disabled_plugins = set([
    'Douban Books',
    'Douban Books', 'Douban.com covers',
])

def is_disabled(plugin):
@@ -62,49 +62,104 @@ def wrap_lines(match):
    else:
        return ital+' '

def line_length(format, raw, percent):
class DocAnalysis(object):
    '''
    raw is the raw text to find the line length to use for wrapping.
    percentage is a decimal number, 0 - 1 which is used to determine
    how far in the list of line lengths to use. The list of line lengths is
    ordered smallest to largest and does not include duplicates. 0.5 is the
    median value.
    Provides various text analysis functions to determine how the document is structured.
    format is the type of document the analysis will be done against.
    raw is the raw text to determine the line length to use for wrapping.
    Blank lines are excluded from analysis
    '''
    raw = raw.replace('&nbsp;', ' ')
    if format == 'html':
        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
    elif format == 'pdf':
        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
    elif format == 'spanned_html':
        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
    lines = linere.findall(raw)

    lengths = []
    for line in lines:
        if len(line) > 0:
            lengths.append(len(line))
    def __init__(self, format='html', raw=''):
        raw = raw.replace('&nbsp;', ' ')
        if format == 'html':
            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        self.lines = linere.findall(raw)

    if not lengths:
        return 0
    def line_length(self, percent):
        '''
        Analyses the document to find the median line length.
        percentage is a decimal number, 0 - 1 which is used to determine
        how far in the list of line lengths to use. The list of line lengths is
        ordered smallest to largest and does not include duplicates. 0.5 is the
        median value.
        '''
        lengths = []
        for line in self.lines:
            if len(line) > 0:
                lengths.append(len(line))

    lengths = list(set(lengths))
    total = sum(lengths)
    avg = total / len(lengths)
    max_line = avg * 2
        if not lengths:
            return 0

    lengths = sorted(lengths)
    for i in range(len(lengths) - 1, -1, -1):
        if lengths[i] > max_line:
            del lengths[i]
        lengths = list(set(lengths))
        total = sum(lengths)
        avg = total / len(lengths)
        max_line = avg * 2

    if percent > 1:
        percent = 1
    if percent < 0:
        percent = 0
        lengths = sorted(lengths)
        for i in range(len(lengths) - 1, -1, -1):
            if lengths[i] > max_line:
                del lengths[i]

    index = int(len(lengths) * percent) - 1
        if percent > 1:
            percent = 1
        if percent < 0:
            percent = 0

    return lengths[index]
        index = int(len(lengths) * percent) - 1

        return lengths[index]

    def line_histogram(self, percent):
        '''
        Creates a broad histogram of the document to determine whether it incorporates hard
        line breaks. Lines are sorted into 20 'buckets' based on length.
        percent is the percentage of lines that should be in a single bucket to return true
        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
        '''
        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
        maxLineLength=1900 # Discard larger than this to stay in range
        buckets=20 # Each line is divided into a bucket based on length

        #print "there are "+str(len(lines))+" lines"
        #max = 0
        #for line in self.lines:
        #    l = len(line)
        #    if l > max:
        #        max = l
        #print "max line found is "+str(max)
        # Build the line length histogram
        hRaw = [ 0 for i in range(0,buckets) ]
        for line in self.lines:
            l = len(line)
            if l > minLineLength and l < maxLineLength:
                l = int(l/100)
                #print "adding "+str(l)
                hRaw[l]+=1

        # Normalize the histogram into percents
        totalLines = len(self.lines)
        h = [ float(count)/totalLines for count in hRaw ]
        #print "\nhRaw histogram lengths are: "+str(hRaw)
        #print "       percents are: "+str(h)+"\n"

        # Find the biggest bucket
        maxValue = 0
        for i in range(0,len(h)):
            if h[i] > maxValue:
                maxValue = h[i]

        if maxValue < percent:
            #print "Line lengths are too variable. Not unwrapping."
            return False
        else:
            #print str(maxValue)+" of the lines were in one bucket"
            return True
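For orientation, here is a minimal sketch of how the new DocAnalysis class is driven by the conversion code in the later hunks of this commit (the import and the two method calls are the ones used there; the sample HTML string is invented purely for illustration):

from calibre.ebooks.conversion.preprocess import DocAnalysis

# 200 long lines of roughly equal length plus a handful of short ones (illustrative data only)
sample = (u'<p>' + u'x' * 120 + u'</p>\n') * 200 + (u'<p>short line</p>\n') * 10

docanalysis = DocAnalysis('html', sample)
length = docanalysis.line_length(0.4)         # percentile-based line length later used for unwrapping
hardbreaks = docanalysis.line_histogram(.50)  # True when at least 50% of lines fall into one length bucket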
class Dehyphenator(object):
    '''
@@ -117,42 +172,62 @@ class Dehyphenator(object):
    def __init__(self):
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

    def dehyphenate(self, match):
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        try:
            wraptags = match.group('wraptags')
        except:
            wraptags = ''
        hyphenated = str(firsthalf) + "-" + str(secondhalf)
        dehyphenated = str(firsthalf) + str(secondhalf)
        lookupword = self.removesuffixes.sub('', dehyphenated)
        if self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
        match = booklookup.search(self.html)
        if match:
            #print "returned dehyphenated word: " + str(dehyphenated)
            return dehyphenated
        else:
            #print "returned hyphenated word: " + str(hyphenated)
            try:
                searchresult = self.html.find(str.lower(lookupword))
            except:
                return hyphenated
            if self.format == 'html_cleanup':
                if self.html.find(lookupword) != -1 or searchresult != -1:
                    #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                    return dehyphenated
                elif self.html.find(hyphenated) != -1:
                    #print "Cleanup:returned hyphenated word: " + str(hyphenated)
                    return hyphenated
                else:
                    #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                    return firsthalf+u'\u2014'+wraptags+secondhalf

            else:
                if self.html.find(lookupword) != -1 or searchresult != -1:
                    #print "returned dehyphenated word: " + str(dehyphenated)
                    return dehyphenated
                else:
                    #print "  returned hyphenated word: " + str(hyphenated)
                    return hyphenated

    def __call__(self, html, format, length=1):
        self.html = html
        self.format = format
        if format == 'html':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
        elif format == 'pdf':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')

        html = intextmatch.sub(self.dehyphenate, html)
        return html
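Similarly, a small sketch of how the Dehyphenator above is invoked elsewhere in this commit (the 'html' and 'html_cleanup' format names and the length argument mirror the calls in utils.py below; the input string is made up):

from calibre.ebooks.conversion.preprocess import Dehyphenator

raw = u'<p>It was completely redone; the recipe was com-</p> <p>pletely rewritten for this update.</p>'

dehyphenator = Dehyphenator()
# In 'html' mode the hyphen is only removed when the joined word ("completely")
# already occurs elsewhere in the document, as it does here.
unwrapped = dehyphenator(raw, 'html', 20)
# 'html_cleanup' is the more permissive pass used when no unwrapping was performed.
cleaned = dehyphenator(raw, 'html_cleanup', 20)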
class CSSPreProcessor(object):

    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  # Cover the case where every letter in a chapter title is separated by a space
                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),

@@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
                print 'Failed to parse remove_footer regexp'
                traceback.print_exc()

        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
        # delete soft hyphens - moved here so it's executed after header/footer removal
        if is_pdftohtml:
            # unwrap em/en dashes
            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens
            end_rules.append((re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens with formatting
@@ -391,12 +464,15 @@ class HTMLPreProcessor(object):

        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # print "The pdf line length returned is " + str(length)
                #print "The pdf line length returned is " + str(length)
                # unwrap em/en dashes
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
                    (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                    (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )

        for rule in self.PREPROCESS + start_rules:
@@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)

        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        if unsupported_unicode_chars:
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            for char in unsupported_unicode_chars:
                asciichar = unidecoder.decode(char)
                html = html.replace(char, asciichar)

        return html

    def smarten_punctuation(self, html):
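Read together with the profiles.py hunk earlier in this commit, the new unsupported_unicode_chars hook boils down to the following (a standalone sketch; in the real pipeline the loop runs inside HTMLPreProcessor with whatever output profile the user selected):

from calibre.ebooks.unidecode.unidecoder import Unidecoder

# SonyReaderOutput declares these two characters as unsupported (see profiles.py above)
unsupported_unicode_chars = [u'\u201f', u'\u201b']
html = u'<p>\u201bQuoted\u201f text that the reader firmware cannot display</p>'

unidecoder = Unidecoder()
for char in unsupported_unicode_chars:
    asciichar = unidecoder.decode(char)  # ASCII stand-in picked by the unidecode tables
    html = html.replace(char, asciichar)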
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log

class PreProcessor(object):
@@ -77,13 +77,18 @@ class PreProcessor(object):

    def __call__(self, html):
        self.log("********* Preprocessing HTML *********")

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = re.sub(r"\s*</p>", "</p>\n", html)
        html = re.sub(r"\s*<p>\s*", "\n<p>", html)

        ###### Check Markup ######
        #
        # some lit files don't have any <p> tags or equivalent (generally just plain text between
        # <pre> tags), check and mark up line endings if required before proceeding
        if self.no_markup(html, 0.1):
            self.log("not enough paragraph markers, adding now")
            # check if content is in pre tags, use txt procesor to mark up if so
            # check if content is in pre tags, use txt processor to mark up if so
            pre = re.compile(r'<pre>', re.IGNORECASE)
            if len(pre.findall(html)) == 1:
                self.log("Running Text Processing")
@@ -113,47 +118,77 @@ class PreProcessor(object):
        # Get rid of empty <o:p> tags to simplify other processing
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        # Get rid of empty span, bold, & italics tags
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)

        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
        # paragraph spacing then delete blank lines to clean up spacing
        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
        blanklines = blankreg.findall(html)
        lines = linereg.findall(html)
        blanks_between_paragraphs = False
        if len(lines) > 1:
            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
            if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
                    'remove_paragraph_spacing', False):
                self.log("deleting blank lines")
                html = blankreg.sub('', html)
                # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
                html = re.sub(r"\s*</p>", "</p>\n", html)
                html = re.sub(r"\s*<p>\s*", "\n<p>", html)
            elif float(len(blanklines)) / float(len(lines)) > 0.40:
                blanks_between_paragraphs = True
                #print "blanks between paragraphs is marked True"
            else:
                blanks_between_paragraphs = False
        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
        # detect chapters/sections to match xpath or splitting logic
        #
        # Build the Regular Expressions in pieces
        lookahead = "(?=<(p|div))"
        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
        chapter_header_open = r"(?P<chap>"
        chapter_header_close = ")\s*"
        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
        if blanks_between_paragraphs:
            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
        else:
            blank_lines = ""
        opt_title_open = "("
        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
        title_header_open = "(?P<title>"
        title_header_close = ")\s*"
        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
        opt_title_close = ")?"

        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"

        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
        #print chapter_marker
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
        #
        # Start with most typical chapter headings, get more aggressive until one works
        if self.html_preprocess_sections < 10:
            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
            html = chapdetect.sub(self.chapter_head, html)
        if self.html_preprocess_sections < 10:
            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
            html = chapdetect2.sub(self.chapter_head, html)

        if self.html_preprocess_sections < 10:
            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
            html = chapdetect2.sub(self.chapter_head, html)

        ###### Unwrap lines ######
        #
        self.log("Unwrapping Lines")
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
        # that lines can be un-wrapped across page boundaries
@@ -168,25 +203,40 @@ class PreProcessor(object):
                format = 'html'
        else:
            format = 'html'

        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
        self.log("Hard line breaks check returned "+str(hardbreaks))
        # Calculate Length
        length = line_length(format, html, getattr(self.extra_opts,
            'html_unwrap_factor', 0.4))
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
        max_length = length * 1.4
        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
        #
        # Unwrap em/en dashes, delete soft-hyphens
        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
        # Dehyphenate
        dehyphenator = Dehyphenator()
        html = dehyphenator(html,'html', length)
        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
        if hardbreaks or unwrap_factor < 0.4:
            self.log("Unwrapping required, unwrapping Lines")
            # Unwrap em/en dashes
            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
            # Dehyphenate
            self.log("Unwrapping/Removing hyphens")
            dehyphenator = Dehyphenator()
            html = dehyphenator(html,'html', length)
            self.log("Done dehyphenating")
            # Unwrap lines using punctuation and line length
            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
            html = unwrap.sub(' ', html)
            #check any remaining hyphens, but only unwrap if there is a match
            dehyphenator = Dehyphenator()
            html = dehyphenator(html,'html_cleanup', length)
        else:
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log("Cleaning up hyphenation")
            dehyphenator = Dehyphenator()
            html = dehyphenator(html,'html_cleanup', length)
            self.log("Done dehyphenating")

        # Unwrap lines using punctuation and line length
        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
        html = unwrap.sub(' ', html)
        # delete soft hyphens
        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < 10:
@@ -9,6 +9,7 @@ import traceback, socket, re, sys
from functools import partial
from threading import Thread, Event
from Queue import Queue, Empty
from lxml import etree

import mechanize

@@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{

# }}}

class DoubanCovers(CoverDownload): # {{{
    'Download covers from Douban.com'

    DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
    CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
    name = 'Douban.com covers'
    description = _('Download covers from Douban.com')
    author = 'Li Fanxi'

    def get_cover_url(self, isbn, br, timeout=5.):
        try:
            url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
            src = br.open(url, timeout=timeout).read()
        except Exception, err:
            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
                err = Exception(_('Douban.com API timed out. Try again later.'))
            raise err
        else:
            feed = etree.fromstring(src)
            NAMESPACES = {
              'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
              'atom' : 'http://www.w3.org/2005/Atom',
              'db': 'http://www.douban.com/xmlns/'
            }
            XPath = partial(etree.XPath, namespaces=NAMESPACES)
            entries = XPath('//atom:entry')(feed)
            if len(entries) < 1:
                return None
            try:
                cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
                u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
                # If URL contains "book-default", the book doesn't have a cover
                if u.find('book-default') != -1:
                    return None
            except:
                return None
            return u

    def has_cover(self, mi, ans, timeout=5.):
        if not mi.isbn:
            return False
        br = browser()
        try:
            if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
                self.debug('cover for', mi.isbn, 'found')
                ans.set()
        except Exception, e:
            self.debug(e)

    def get_covers(self, mi, result_queue, abort, timeout=5.):
        if not mi.isbn:
            return
        br = browser()
        try:
            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
            cover_data = br.open_novisit(url).read()
            result_queue.put((True, cover_data, 'jpg', self.name))
        except Exception, e:
            result_queue.put((False, self.exception_to_string(e),
                traceback.format_exc(), self.name))
# }}}

def download_cover(mi, timeout=5.): # {{{
    results = Queue()
    download_covers(mi, results, max_covers=1, timeout=timeout)
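The new downloader is registered in the builtins.py hunk above but left in the default-disabled set (see the customize/ui.py hunk), and the wizard change below switches it on automatically for Chinese locales. A user who wants it elsewhere can presumably enable it with the same call the wizard uses:

from calibre.customize.ui import enable_plugin
enable_plugin('Douban.com covers')  # the same call the welcome wizard makes for zh_* locales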
@@ -217,9 +217,12 @@ def fetch_scheduled_recipe(arg):
    if 'output_profile' in ps:
        recs.append(('output_profile', ps['output_profile'],
            OptionRecommendation.HIGH))
        if ps['output_profile'] == 'kindle':
            recs.append(('no_inline_toc', True,
                OptionRecommendation.HIGH))
        # Disabled since apparently some people use
        # K4PC and, surprise, surprise, it doesn't support
        # indexed MOBIs.
        #if ps['output_profile'] == 'kindle':
        #    recs.append(('no_inline_toc', True,
        #        OptionRecommendation.HIGH))

    lf = load_defaults('look_and_feel')
    if lf.get('base_font_size', 0.0) != 0.0:
@@ -584,12 +584,42 @@ class LibraryPage(QWizardPage, LibraryUI):
            qt_app.load_translations()
            self.emit(SIGNAL('retranslate()'))
            self.init_languages()
            try:
                if prefs['language'].lower().startswith('zh'):
                    from calibre.customize.ui import enable_plugin
                    for name in ('Douban Books', 'Douban.com covers'):
                        enable_plugin(name)
            except:
                pass

    def is_library_dir_suitable(self, x):
        return LibraryDatabase2.exists_at(x) or not os.listdir(x)

    def validatePage(self):
        newloc = unicode(self.location.text())
        if not self.is_library_dir_suitable(newloc):
            self.show_library_dir_error(newloc)
            return False
        return True

    def change(self):
        dir = choose_dir(self, 'database location dialog',
        x = choose_dir(self, 'database location dialog',
                _('Select location for books'))
        if dir:
            self.location.setText(dir)
        if x:
            if self.is_library_dir_suitable(x):
                self.location.setText(x)
            else:
                self.show_library_dir_error(x)

    def show_library_dir_error(self, x):
        if not isinstance(x, unicode):
            try:
                x = x.decode(filesystem_encoding)
            except:
                x = unicode(repr(x))
        error_dialog(self, _('Bad location'),
            _('You must choose an empty folder for '
                'the calibre library. %s is not empty.')%x, show=True)

    def initializePage(self):
        lp = prefs['library_path']
@@ -282,7 +282,7 @@ Watch the tutorial `UI Power tips <http://calibre-ebook.com/demo#tutorials>`_ to

How do I move my |app| library from one computer to another?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder.
Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome Wizard won't run. In that case, click the calibre icon in the toolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the calibre icon on the toolbar.

Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also go to :guilabel:`Preferences->Advanced->Miscellaneous` and click the "Check database integrity button". It will warn you about missing files, if any, which you should then transfer by hand.