mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge from trunk
This commit is contained in:
commit
6f6dbc0d35
@ -1,6 +1,4 @@
|
|||||||
@echo OFF
|
@echo OFF
|
||||||
REM CalibreRun.bat
|
|
||||||
REM ~~~~~~~~~~~~~~
|
|
||||||
REM Batch File to start a Calibre configuration on Windows
|
REM Batch File to start a Calibre configuration on Windows
|
||||||
REM giving explicit control of the location of:
|
REM giving explicit control of the location of:
|
||||||
REM - Calibre Program Files
|
REM - Calibre Program Files
|
||||||
@ -24,7 +22,10 @@ REM -------------------------------------
|
|||||||
REM Set up Calibre Config folder
|
REM Set up Calibre Config folder
|
||||||
REM -------------------------------------
|
REM -------------------------------------
|
||||||
|
|
||||||
If EXIST CalibreConfig SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
|
IF EXIST CalibreConfig (
|
||||||
|
SET CALIBRE_CONFIG_DIRECTORY=%cd%\CalibreConfig
|
||||||
|
ECHO CONFIG=%cd%\CalibreConfig
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
@ -38,9 +39,18 @@ REM drive letter of the USB stick.
|
|||||||
REM Comment out any of the following that are not to be used
|
REM Comment out any of the following that are not to be used
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
|
|
||||||
SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
|
IF EXIST U:\eBooks\CalibreLibrary (
|
||||||
IF EXIST CalibreLibrary SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
|
SET CALIBRE_LIBRARY_DIRECTORY=U:\eBOOKS\CalibreLibrary
|
||||||
IF EXIST CalibreBooks SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
|
ECHO LIBRARY=U:\eBOOKS\CalibreLibrary
|
||||||
|
)
|
||||||
|
IF EXIST CalibreLibrary (
|
||||||
|
SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreLibrary
|
||||||
|
ECHO LIBRARY=%cd%\CalibreLibrary
|
||||||
|
)
|
||||||
|
IF EXIST CalibreBooks (
|
||||||
|
SET CALIBRE_LIBRARY_DIRECTORY=%cd%\CalibreBooks
|
||||||
|
ECHO LIBRARY=%cd%\CalibreBooks
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
@ -50,12 +60,32 @@ REM Location where the metadata.db file is located. If not set
|
|||||||
REM the same location as Books files will be assumed. This
|
REM the same location as Books files will be assumed. This
|
||||||
REM option is used to get better performance when the Library is
|
REM option is used to get better performance when the Library is
|
||||||
REM on a (slow) network drive. Putting the metadata.db file
|
REM on a (slow) network drive. Putting the metadata.db file
|
||||||
REM locally gives a big performance improvement.
|
REM locally gives a big performance improvement.
|
||||||
|
REM
|
||||||
|
REM NOTE. If you use this option, then the ability to switch
|
||||||
|
REM libraries within Calibre will be disabled. Therefore
|
||||||
|
REM you do not want to set it if the metadata.db file
|
||||||
|
REM is at the same location as the book files.
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
|
|
||||||
IF EXIST CalibreBooks SET SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
|
IF EXIST CalibreBooks (
|
||||||
IF EXIST CalibreMetadata SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
|
IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreBooks" (
|
||||||
|
SET SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreBooks\metadata.db
|
||||||
|
ECHO DATABASE=%cd%\CalibreBooks\metadata.db
|
||||||
|
ECHO '
|
||||||
|
ECHO ***CAUTION*** Library Switching will be disabled
|
||||||
|
ECHO '
|
||||||
|
)
|
||||||
|
)
|
||||||
|
IF EXIST CalibreMetadata (
|
||||||
|
IF NOT "%CALIBRE_LIBRARY_DIRECTORY%" == "%cd%\CalibreMetadata" (
|
||||||
|
SET CALIBRE_OVERRIDE_DATABASE_PATH=%cd%\CalibreMetadata\metadata.db
|
||||||
|
ECHO DATABASE=%cd%\CalibreMetadata\metadata.db
|
||||||
|
ECHO '
|
||||||
|
ECHO ***CAUTION*** Library Switching will be disabled
|
||||||
|
ECHO '
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
REM Specify Location of source (optional)
|
REM Specify Location of source (optional)
|
||||||
@ -63,13 +93,20 @@ REM
|
|||||||
REM It is easy to run Calibre from source
|
REM It is easy to run Calibre from source
|
||||||
REM Just set the environment variable to where the source is located
|
REM Just set the environment variable to where the source is located
|
||||||
REM When running from source the GUI will have a '*' after the version.
|
REM When running from source the GUI will have a '*' after the version.
|
||||||
|
REM number that is displayed at the bottom of the Calibre main screen.
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
|
|
||||||
IF EXIST Calibre\src SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
|
IF EXIST Calibre\src (
|
||||||
|
SET CALIBRE_DEVELOP_FROM=%cd%\Calibre\src
|
||||||
|
ECHO SOURCE=%cd%\Calibre\src
|
||||||
|
)
|
||||||
|
IF EXIST D:\Calibre\Calibre\src (
|
||||||
|
SET CALIBRE_DEVELOP_FROM=D:\Calibre\Calibre\src
|
||||||
|
ECHO SOURCE=D:\Calibre\Calibre\src
|
||||||
|
)
|
||||||
|
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
REM Specify Location of calibre binaries (optinal)
|
REM Specify Location of calibre binaries (optional)
|
||||||
REM
|
REM
|
||||||
REM To avoid needing Calibre to be set in the search path, ensure
|
REM To avoid needing Calibre to be set in the search path, ensure
|
||||||
REM that Calibre Program Files is current directory when starting.
|
REM that Calibre Program Files is current directory when starting.
|
||||||
@ -78,21 +115,15 @@ REM This folder can be populated by copying the Calibre2 folder from
|
|||||||
REM an existing installation or by installing directly to here.
|
REM an existing installation or by installing directly to here.
|
||||||
REM --------------------------------------------------------------
|
REM --------------------------------------------------------------
|
||||||
|
|
||||||
IF EXIST Calibre2 CD Calibre2
|
IF EXIST Calibre2 (
|
||||||
|
Calibre2 CD Calibre2
|
||||||
|
ECHO PROGRAMS=%cd%
|
||||||
REM --------------------------------------------
|
)
|
||||||
REM Display settings that will be used
|
|
||||||
REM --------------------------------------------
|
|
||||||
|
|
||||||
echo PROGRAMS=%cd%
|
|
||||||
echo SOURCE=%CALIBRE_DEVELOP_FROM%
|
|
||||||
echo CONFIG=%CALIBRE_CONFIG_DIRECTORY%
|
|
||||||
echo LIBRARY=%CALIBRE_LIBRARY_DIRECTORY%
|
|
||||||
echo DATABASE=%CALIBRE_OVERRIDE_DATABASE_PATH%
|
|
||||||
|
|
||||||
|
REM ----------------------------------------------------------
|
||||||
REM The following gives a chance to check the settings before
|
REM The following gives a chance to check the settings before
|
||||||
REM starting Calibre. It can be commented out if not wanted.
|
REM starting Calibre. It can be commented out if not wanted.
|
||||||
|
REM ----------------------------------------------------------
|
||||||
|
|
||||||
echo "Press CTRL-C if you do not want to continue"
|
echo "Press CTRL-C if you do not want to continue"
|
||||||
pause
|
pause
|
||||||
@ -111,4 +142,4 @@ REM Use with /WAIT to wait until Calibre completes to run a task on exit
|
|||||||
REM --------------------------------------------------------
|
REM --------------------------------------------------------
|
||||||
|
|
||||||
echo "Starting up Calibre"
|
echo "Starting up Calibre"
|
||||||
START /belownormal Calibre --with-library %CALIBRE_LIBRARY_DIRECTORY%
|
START /belownormal Calibre --with-library "%CALIBRE_LIBRARY_DIRECTORY%"
|
||||||
|
BIN
resources/images/heuristics.png
Normal file
BIN
resources/images/heuristics.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.3 KiB |
@ -7,22 +7,29 @@ class DallasNews(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 25
|
max_articles_per_feed = 25
|
||||||
|
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_tags_before = dict(name='h2', attrs={'class':'vitstoryheadline'})
|
use_embedded_content = False
|
||||||
remove_tags_after = dict(name='div', attrs={'style':'width: 100%; clear: right'})
|
remove_tags_before = dict(name='h1')
|
||||||
remove_tags_after = dict(name='div', attrs={'id':'article_tools_bottom'})
|
keep_only_tags = {'class':lambda x: x and 'article' in x}
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='iframe'),
|
{'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
|
||||||
dict(name='div', attrs={'class':'biblockmore'}),
|
|
||||||
dict(name='div', attrs={'style':'width: 100%; clear: right'}),
|
|
||||||
dict(name='div', attrs={'id':'article_tools_bottom'}),
|
|
||||||
#dict(name='ul', attrs={'class':'articleTools'}),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
('Latest News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslatestnews.xml'),
|
('Local News',
|
||||||
('Local News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslocalnews.xml'),
|
'http://www.dallasnews.com/news/politics/local-politics/?rss'),
|
||||||
('Nation and World', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml'),
|
('National Politics',
|
||||||
('Politics', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml'),
|
'http://www.dallasnews.com/news/politics/national-politic/?rss'),
|
||||||
('Science', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsscience.xml'),
|
('State Politics',
|
||||||
|
'http://www.dallasnews.com/news/politics/state-politics/?rss'),
|
||||||
|
('Religion',
|
||||||
|
'http://www.dallasnews.com/news/religion/?rss'),
|
||||||
|
('Crime',
|
||||||
|
'http://www.dallasnews.com/news/crime/headlines/?rss'),
|
||||||
|
('Celebrity News',
|
||||||
|
'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
|
||||||
|
('Nation',
|
||||||
|
'http://www.dallasnews.com/news/nation-world/nation/?rss'),
|
||||||
|
('World',
|
||||||
|
'http://www.dallasnews.com/news/nation-world/world/?rss'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -20,8 +20,8 @@ class LaVanguardia(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
delay = 1
|
delay = 5
|
||||||
encoding = 'cp1252'
|
# encoding = 'cp1252'
|
||||||
language = 'es'
|
language = 'es'
|
||||||
|
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
@ -35,7 +35,7 @@ class LaVanguardia(BasicNewsRecipe):
|
|||||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Ciudadanos' , u'http://feeds.feedburner.com/lavanguardia/ciudadanos' )
|
(u'Portada' , u'http://feeds.feedburner.com/lavanguardia/home' )
|
||||||
,(u'Cultura' , u'http://feeds.feedburner.com/lavanguardia/cultura' )
|
,(u'Cultura' , u'http://feeds.feedburner.com/lavanguardia/cultura' )
|
||||||
,(u'Deportes' , u'http://feeds.feedburner.com/lavanguardia/deportes' )
|
,(u'Deportes' , u'http://feeds.feedburner.com/lavanguardia/deportes' )
|
||||||
,(u'Economia' , u'http://feeds.feedburner.com/lavanguardia/economia' )
|
,(u'Economia' , u'http://feeds.feedburner.com/lavanguardia/economia' )
|
||||||
@ -45,17 +45,17 @@ class LaVanguardia(BasicNewsRecipe):
|
|||||||
,(u'Internet y tecnologia', u'http://feeds.feedburner.com/lavanguardia/internet' )
|
,(u'Internet y tecnologia', u'http://feeds.feedburner.com/lavanguardia/internet' )
|
||||||
,(u'Motor' , u'http://feeds.feedburner.com/lavanguardia/motor' )
|
,(u'Motor' , u'http://feeds.feedburner.com/lavanguardia/motor' )
|
||||||
,(u'Politica' , u'http://feeds.feedburner.com/lavanguardia/politica' )
|
,(u'Politica' , u'http://feeds.feedburner.com/lavanguardia/politica' )
|
||||||
,(u'Sucessos' , u'http://feeds.feedburner.com/lavanguardia/sucesos' )
|
,(u'Sucesos' , u'http://feeds.feedburner.com/lavanguardia/sucesos' )
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='div', attrs={'class':'element1_3'})
|
dict(name='div', attrs={'class':'detalle noticia'})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['object','link','script'])
|
dict(name=['object','link','script'])
|
||||||
,dict(name='div', attrs={'class':['colC','peu']})
|
,dict(name='div', attrs={'class':['colC','peu','jstoolbar']})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags_after = [dict(name='div', attrs={'class':'text'})]
|
remove_tags_after = [dict(name='div', attrs={'class':'text'})]
|
||||||
@ -67,4 +67,3 @@ class LaVanguardia(BasicNewsRecipe):
|
|||||||
for item in soup.findAll(style=True):
|
for item in soup.findAll(style=True):
|
||||||
del item['style']
|
del item['style']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'relatedSearchesModule',
|
'relatedSearchesModule',
|
||||||
'side_tool',
|
'side_tool',
|
||||||
'singleAd',
|
'singleAd',
|
||||||
|
'entry entry-utility', #added for DealBook
|
||||||
|
'entry-tags', #added for DealBook
|
||||||
|
'footer promos clearfix', #added for DealBook
|
||||||
|
'footer links clearfix', #added for DealBook
|
||||||
|
'inlineImage module', #added for DealBook
|
||||||
re.compile('^subNavigation'),
|
re.compile('^subNavigation'),
|
||||||
re.compile('^leaderboard'),
|
re.compile('^leaderboard'),
|
||||||
re.compile('^module'),
|
re.compile('^module'),
|
||||||
@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'side_index',
|
'side_index',
|
||||||
'side_tool',
|
'side_tool',
|
||||||
'toolsRight',
|
'toolsRight',
|
||||||
|
'skybox', #added for DealBook
|
||||||
|
'TopAd', #added for DealBook
|
||||||
|
'related-content', #added for DealBook
|
||||||
]),
|
]),
|
||||||
dict(name=['script', 'noscript', 'style','form','hr'])]
|
dict(name=['script', 'noscript', 'style','form','hr'])]
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
def exclude_url(self,url):
|
def exclude_url(self,url):
|
||||||
if not url.startswith("http"):
|
if not url.startswith("http"):
|
||||||
return True
|
return True
|
||||||
if not url.endswith(".html"):
|
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
|
||||||
return True
|
return True
|
||||||
if 'nytimes.com' not in url:
|
if 'nytimes.com' not in url:
|
||||||
return True
|
return True
|
||||||
@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
|
||||||
if self.webEdition & (self.oldest_article>0):
|
if self.webEdition & (self.oldest_article>0):
|
||||||
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
||||||
if date_tag:
|
if date_tag:
|
||||||
@ -592,9 +599,12 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
img_div = soup.find('div','inlineImage module')
|
img_div = soup.find('div','inlineImage module')
|
||||||
if img_div:
|
if img_div:
|
||||||
img_div.extract()
|
img_div.extract()
|
||||||
|
|
||||||
|
|
||||||
return self.strip_anchors(soup)
|
return self.strip_anchors(soup)
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
def postprocess_html(self,soup, True):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.one_picture_per_article:
|
if self.one_picture_per_article:
|
||||||
# Remove all images after first
|
# Remove all images after first
|
||||||
@ -650,6 +660,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
try:
|
try:
|
||||||
# Change <nyt_headline> to <h2>
|
# Change <nyt_headline> to <h2>
|
||||||
h1 = soup.find('h1')
|
h1 = soup.find('h1')
|
||||||
|
blogheadline = str(h1) #added for dealbook
|
||||||
if h1:
|
if h1:
|
||||||
headline = h1.find("nyt_headline")
|
headline = h1.find("nyt_headline")
|
||||||
if headline:
|
if headline:
|
||||||
@ -657,13 +668,19 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
tag['class'] = "headline"
|
tag['class'] = "headline"
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
h1.replaceWith(tag)
|
h1.replaceWith(tag)
|
||||||
|
elif blogheadline.find('entry-title'):#added for dealbook
|
||||||
|
tag = Tag(soup, "h2")#added for dealbook
|
||||||
|
tag['class'] = "headline"#added for dealbook
|
||||||
|
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
|
||||||
|
h1.replaceWith(tag)#added for dealbook
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Blog entry - replace headline, remove <hr> tags
|
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
|
||||||
headline = soup.find('title')
|
headline = soup.find('title')
|
||||||
if headline:
|
if headline:
|
||||||
tag = Tag(soup, "h2")
|
tag = Tag(soup, "h2")
|
||||||
tag['class'] = "headline"
|
tag['class'] = "headline"
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
tag.insert(0, self.fixChars(headline.renderContents()))
|
||||||
soup.insert(0, tag)
|
soup.insert(0, tag)
|
||||||
hrs = soup.findAll('hr')
|
hrs = soup.findAll('hr')
|
||||||
for hr in hrs:
|
for hr in hrs:
|
||||||
@ -671,6 +688,29 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
except:
|
except:
|
||||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||||
|
|
||||||
|
try:
|
||||||
|
#if this is from a blog (dealbook), fix the byline format
|
||||||
|
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
|
||||||
|
if bylineauthor:
|
||||||
|
tag = Tag(soup, "h6")
|
||||||
|
tag['class'] = "byline"
|
||||||
|
tag.insert(0, self.fixChars(bylineauthor.renderContents()))
|
||||||
|
bylineauthor.replaceWith(tag)
|
||||||
|
except:
|
||||||
|
self.log("ERROR: fixing byline author format")
|
||||||
|
|
||||||
|
try:
|
||||||
|
#if this is a blog (dealbook) fix the credit style for the pictures
|
||||||
|
blogcredit = soup.find('div',attrs={'class':'credit'})
|
||||||
|
if blogcredit:
|
||||||
|
tag = Tag(soup, "h6")
|
||||||
|
tag['class'] = "credit"
|
||||||
|
tag.insert(0, self.fixChars(blogcredit.renderContents()))
|
||||||
|
blogcredit.replaceWith(tag)
|
||||||
|
except:
|
||||||
|
self.log("ERROR: fixing credit format")
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Change <h1> to <h3> - used in editorial blogs
|
# Change <h1> to <h3> - used in editorial blogs
|
||||||
masthead = soup.find("h1")
|
masthead = soup.find("h1")
|
||||||
@ -693,6 +733,13 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
subhead.replaceWith(bTag)
|
subhead.replaceWith(bTag)
|
||||||
except:
|
except:
|
||||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||||
|
try:
|
||||||
|
#remove the <strong> update tag
|
||||||
|
blogupdated = soup.find('span', {'class':'update'})
|
||||||
|
if blogupdated:
|
||||||
|
blogupdated.replaceWith("")
|
||||||
|
except:
|
||||||
|
self.log("ERROR: Removing strong tag")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||||
|
@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
|
|||||||
'''
|
'''
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def preprocess_html(self, opts, html):
|
|
||||||
'''
|
|
||||||
This method is called by the conversion pipeline on all HTML before it
|
|
||||||
is parsed. It is meant to be used to do any required preprocessing on
|
|
||||||
the HTML, like removing hard line breaks, etc.
|
|
||||||
|
|
||||||
:param html: A unicode string
|
|
||||||
:return: A unicode string
|
|
||||||
'''
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
'''
|
'''
|
||||||
This method must be implemented in sub-classes. It must return
|
This method must be implemented in sub-classes. It must return
|
||||||
|
@ -98,6 +98,9 @@ class PRS505(USBMS):
|
|||||||
|
|
||||||
THUMBNAIL_HEIGHT = 200
|
THUMBNAIL_HEIGHT = 200
|
||||||
|
|
||||||
|
MAX_PATH_LEN = 201 # 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
|
||||||
|
# len('main_thumbnail.jpg') + 1)
|
||||||
|
|
||||||
def windows_filter_pnp_id(self, pnp_id):
|
def windows_filter_pnp_id(self, pnp_id):
|
||||||
return '_LAUNCHER' in pnp_id
|
return '_LAUNCHER' in pnp_id
|
||||||
|
|
||||||
@ -225,12 +228,6 @@ class PRS505(USBMS):
|
|||||||
self.plugboards = plugboards
|
self.plugboards = plugboards
|
||||||
self.plugboard_func = pb_func
|
self.plugboard_func = pb_func
|
||||||
|
|
||||||
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
|
||||||
maxlen = 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
|
|
||||||
len('main_thumbnail.jpg') + 1)
|
|
||||||
return self._create_upload_path(path, mdata, fname,
|
|
||||||
create_dirs=create_dirs, maxlen=maxlen)
|
|
||||||
|
|
||||||
def upload_cover(self, path, filename, metadata, filepath):
|
def upload_cover(self, path, filename, metadata, filepath):
|
||||||
opts = self.settings()
|
opts = self.settings()
|
||||||
if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
|
if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
|
||||||
@ -238,7 +235,10 @@ class PRS505(USBMS):
|
|||||||
debug_print('PRS505: not uploading cover')
|
debug_print('PRS505: not uploading cover')
|
||||||
return
|
return
|
||||||
debug_print('PRS505: uploading cover')
|
debug_print('PRS505: uploading cover')
|
||||||
|
try:
|
||||||
self._upload_cover(path, filename, metadata, filepath)
|
self._upload_cover(path, filename, metadata, filepath)
|
||||||
|
except:
|
||||||
|
debug_print('FAILED to upload cover', filepath)
|
||||||
|
|
||||||
def _upload_cover(self, path, filename, metadata, filepath):
|
def _upload_cover(self, path, filename, metadata, filepath):
|
||||||
if metadata.thumbnail and metadata.thumbnail[-1]:
|
if metadata.thumbnail and metadata.thumbnail[-1]:
|
||||||
|
@ -98,6 +98,9 @@ class Device(DeviceConfig, DevicePlugin):
|
|||||||
# copy these back to the library
|
# copy these back to the library
|
||||||
BACKLOADING_ERROR_MESSAGE = None
|
BACKLOADING_ERROR_MESSAGE = None
|
||||||
|
|
||||||
|
#: The maximum length of paths created on the device
|
||||||
|
MAX_PATH_LEN = 250
|
||||||
|
|
||||||
def reset(self, key='-1', log_packets=False, report_progress=None,
|
def reset(self, key='-1', log_packets=False, report_progress=None,
|
||||||
detected_device=None):
|
detected_device=None):
|
||||||
self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
|
self._main_prefix = self._card_a_prefix = self._card_b_prefix = None
|
||||||
@ -874,12 +877,8 @@ class Device(DeviceConfig, DevicePlugin):
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
||||||
return self._create_upload_path(path, mdata, fname,
|
|
||||||
create_dirs=create_dirs, maxlen=250)
|
|
||||||
|
|
||||||
def _create_upload_path(self, path, mdata, fname, create_dirs=True,
|
|
||||||
maxlen=None):
|
|
||||||
path = os.path.abspath(path)
|
path = os.path.abspath(path)
|
||||||
|
maxlen = self.MAX_PATH_LEN
|
||||||
|
|
||||||
special_tag = None
|
special_tag = None
|
||||||
if mdata.tags:
|
if mdata.tags:
|
||||||
|
@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
|
|||||||
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
|
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
from calibre.ebooks.oeb.base import DirContainer
|
from calibre.ebooks.oeb.base import DirContainer
|
||||||
oeb = create_oebbook(log, None, opts, self,
|
oeb = create_oebbook(log, None, opts,
|
||||||
encoding=opts.input_encoding, populate=False)
|
encoding=opts.input_encoding, populate=False)
|
||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
|
|
||||||
|
@ -42,6 +42,12 @@ option.
|
|||||||
For full documentation of the conversion system see
|
For full documentation of the conversion system see
|
||||||
''') + 'http://calibre-ebook.com/user_manual/conversion.html'
|
''') + 'http://calibre-ebook.com/user_manual/conversion.html'
|
||||||
|
|
||||||
|
HEURISTIC_OPTIONS = ['markup_chapter_headings',
|
||||||
|
'italicize_common_cases', 'fix_indents',
|
||||||
|
'html_unwrap_factor', 'unwrap_lines',
|
||||||
|
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||||
|
'dehyphenate', 'renumber_headings']
|
||||||
|
|
||||||
def print_help(parser, log):
|
def print_help(parser, log):
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
help = parser.format_help().encode(preferred_encoding, 'replace')
|
||||||
log(help)
|
log(help)
|
||||||
@ -83,6 +89,8 @@ def option_recommendation_to_cli_option(add_option, rec):
|
|||||||
if opt.long_switch == 'verbose':
|
if opt.long_switch == 'verbose':
|
||||||
attrs['action'] = 'count'
|
attrs['action'] = 'count'
|
||||||
attrs.pop('type', '')
|
attrs.pop('type', '')
|
||||||
|
if opt.name in HEURISTIC_OPTIONS and rec.recommended_value is True:
|
||||||
|
switches = ['--disable-'+opt.long_switch]
|
||||||
add_option(Option(*switches, **attrs))
|
add_option(Option(*switches, **attrs))
|
||||||
|
|
||||||
def add_input_output_options(parser, plumber):
|
def add_input_output_options(parser, plumber):
|
||||||
@ -126,8 +134,24 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'margin_top', 'margin_left', 'margin_right',
|
'margin_top', 'margin_left', 'margin_right',
|
||||||
'margin_bottom', 'change_justification',
|
'margin_bottom', 'change_justification',
|
||||||
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
|
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
|
||||||
'asciiize', 'remove_header', 'header_regex',
|
'asciiize',
|
||||||
'remove_footer', 'footer_regex',
|
]
|
||||||
|
),
|
||||||
|
|
||||||
|
'HEURISTIC PROCESSING' : (
|
||||||
|
_('Modify the document text and structure using common'
|
||||||
|
' patterns. Disabled by default. Use %s to enable. '
|
||||||
|
' Individual actions can be disabled with the %s options.')
|
||||||
|
% ('--enable-heuristics', '--disable-*'),
|
||||||
|
['enable_heuristics'] + HEURISTIC_OPTIONS
|
||||||
|
),
|
||||||
|
|
||||||
|
'SEARCH AND REPLACE' : (
|
||||||
|
_('Modify the document text and structure using user defined patterns.'),
|
||||||
|
[
|
||||||
|
'sr1_search', 'sr1_replace',
|
||||||
|
'sr2_search', 'sr2_replace',
|
||||||
|
'sr3_search', 'sr3_replace',
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
@ -137,7 +161,6 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'chapter', 'chapter_mark',
|
'chapter', 'chapter_mark',
|
||||||
'prefer_metadata_cover', 'remove_first_image',
|
'prefer_metadata_cover', 'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before',
|
||||||
'preprocess_html', 'html_unwrap_factor',
|
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
@ -164,7 +187,8 @@ def add_pipeline_options(parser, plumber):
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
|
group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
|
||||||
|
'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
|
||||||
'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
|
'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
|
||||||
|
|
||||||
for group in group_order:
|
for group in group_order:
|
||||||
|
@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata',
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='preprocess_html',
|
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
|
||||||
help=_('Attempt to detect and correct hard line breaks and other '
|
|
||||||
'problems in the source file. This may make things worse, so use '
|
|
||||||
'with care.'
|
|
||||||
)
|
|
||||||
),
|
|
||||||
|
|
||||||
OptionRecommendation(name='html_unwrap_factor',
|
|
||||||
recommended_value=0.40, level=OptionRecommendation.LOW,
|
|
||||||
help=_('Scale used to determine the length at which a line should '
|
|
||||||
'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
|
|
||||||
'default is 0.40, just below the median line length. This will unwrap typical books '
|
|
||||||
' with hard line breaks, but should be reduced if the line length is variable.'
|
|
||||||
)
|
|
||||||
),
|
|
||||||
|
|
||||||
OptionRecommendation(name='smarten_punctuation',
|
OptionRecommendation(name='smarten_punctuation',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Convert plain quotes, dashes and ellipsis to their '
|
help=_('Convert plain quotes, dashes and ellipsis to their '
|
||||||
@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation',
|
|||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='remove_header',
|
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
|
||||||
help=_('Use a regular expression to try and remove the header.'
|
|
||||||
)
|
|
||||||
),
|
|
||||||
|
|
||||||
OptionRecommendation(name='header_regex',
|
|
||||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
|
||||||
level=OptionRecommendation.LOW,
|
|
||||||
help=_('The regular expression to use to remove the header.'
|
|
||||||
)
|
|
||||||
),
|
|
||||||
|
|
||||||
OptionRecommendation(name='remove_footer',
|
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
|
||||||
help=_('Use a regular expression to try and remove the footer.'
|
|
||||||
)
|
|
||||||
),
|
|
||||||
|
|
||||||
OptionRecommendation(name='footer_regex',
|
|
||||||
recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
|
|
||||||
level=OptionRecommendation.LOW,
|
|
||||||
help=_('The regular expression to use to remove the footer.'
|
|
||||||
)
|
|
||||||
),
|
|
||||||
|
|
||||||
OptionRecommendation(name='read_metadata_from_opf',
|
OptionRecommendation(name='read_metadata_from_opf',
|
||||||
recommended_value=None, level=OptionRecommendation.LOW,
|
recommended_value=None, level=OptionRecommendation.LOW,
|
||||||
short_switch='m',
|
short_switch='m',
|
||||||
@ -527,6 +484,89 @@ OptionRecommendation(name='timestamp',
|
|||||||
recommended_value=None, level=OptionRecommendation.LOW,
|
recommended_value=None, level=OptionRecommendation.LOW,
|
||||||
help=_('Set the book timestamp (used by the date column in calibre).')),
|
help=_('Set the book timestamp (used by the date column in calibre).')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='enable_heuristics',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Enable heuristic processing. This option must be set for any '
|
||||||
|
'heuristic processing to take place.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='markup_chapter_headings',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Detect unformatted chapter headings and sub headings. Change '
|
||||||
|
'them to h2 and h3 tags. This setting will not create a TOC, '
|
||||||
|
'but can be used in conjunction with structure detection to create '
|
||||||
|
'one.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='italicize_common_cases',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Look for common words and patterns that denote '
|
||||||
|
'italics and italicize them.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='fix_indents',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Turn indentation created from multiple non-breaking space entities '
|
||||||
|
'into CSS indents.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='html_unwrap_factor',
|
||||||
|
recommended_value=0.40, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Scale used to determine the length at which a line should '
|
||||||
|
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||||
|
'default is 0.4, just below the median line length. If only a '
|
||||||
|
'few lines in the document require unwrapping this value should '
|
||||||
|
'be reduced')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='unwrap_lines',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Unwrap lines using punctuation and other formatting clues.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='delete_blank_paragraphs',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Remove empty paragraphs from the document when they exist between '
|
||||||
|
'every other paragraph')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='format_scene_breaks',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Left aligned scene break markers are center aligned. '
|
||||||
|
'Replace soft scene breaks that use multiple blank lines with'
|
||||||
|
'horizontal rules.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='dehyphenate',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Analyze hyphenated words throughout the document. The '
|
||||||
|
'document itself is used as a dictionary to determine whether hyphens '
|
||||||
|
'should be retained or removed.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='renumber_headings',
|
||||||
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
|
||||||
|
'The tags are renumbered to prevent splitting in the middle '
|
||||||
|
'of chapter headings.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='sr1_search',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Search pattern (regular expression) to be replaced with '
|
||||||
|
'sr1-replace.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='sr1_replace',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Replacement to replace the text found with sr1-search.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='sr2_search',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Search pattern (regular expression) to be replaced with '
|
||||||
|
'sr2-replace.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='sr2_replace',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Replacement to replace the text found with sr2-search.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='sr3_search',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Search pattern (regular expression) to be replaced with '
|
||||||
|
'sr3-replace.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='sr3_replace',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Replacement to replace the text found with sr3-search.')),
|
||||||
]
|
]
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@ -861,7 +901,6 @@ OptionRecommendation(name='timestamp',
|
|||||||
self.opts_to_mi(self.user_metadata)
|
self.opts_to_mi(self.user_metadata)
|
||||||
if not hasattr(self.oeb, 'manifest'):
|
if not hasattr(self.oeb, 'manifest'):
|
||||||
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
|
||||||
self.input_plugin,
|
|
||||||
encoding=self.input_plugin.output_encoding)
|
encoding=self.input_plugin.output_encoding)
|
||||||
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
|
||||||
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
self.opts.is_image_collection = self.input_plugin.is_image_collection
|
||||||
@ -971,14 +1010,13 @@ OptionRecommendation(name='timestamp',
|
|||||||
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
self.log(self.output_fmt.upper(), 'output written to', self.output)
|
||||||
self.flush()
|
self.flush()
|
||||||
|
|
||||||
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
|
def create_oebbook(log, path_or_stream, opts, reader=None,
|
||||||
encoding='utf-8', populate=True):
|
encoding='utf-8', populate=True):
|
||||||
'''
|
'''
|
||||||
Create an OEBBook.
|
Create an OEBBook.
|
||||||
'''
|
'''
|
||||||
from calibre.ebooks.oeb.base import OEBBook
|
from calibre.ebooks.oeb.base import OEBBook
|
||||||
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
|
html_preprocessor = HTMLPreProcessor(log, opts)
|
||||||
opts.preprocess_html, opts)
|
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = None
|
encoding = None
|
||||||
oeb = OEBBook(log, html_preprocessor,
|
oeb = OEBBook(log, html_preprocessor,
|
||||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import functools, re
|
import functools, re
|
||||||
|
|
||||||
from calibre import entity_to_unicode
|
from calibre import entity_to_unicode, as_unicode
|
||||||
|
|
||||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
SVG_NS = 'http://www.w3.org/2000/svg'
|
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||||
@ -174,13 +174,19 @@ class Dehyphenator(object):
|
|||||||
retain hyphens.
|
retain hyphens.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, verbose=0, log=None):
|
||||||
|
self.log = log
|
||||||
|
self.verbose = verbose
|
||||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||||
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
|
# only remove if it's not already the point of hyphenation
|
||||||
|
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
|
||||||
|
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
|
||||||
|
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
|
||||||
# remove prefixes if the prefix was not already the point of hyphenation
|
# remove prefixes if the prefix was not already the point of hyphenation
|
||||||
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
|
self.prefix_string = '^(dis|re|un|in|ex)'
|
||||||
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
|
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
|
||||||
|
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
|
||||||
|
|
||||||
def dehyphenate(self, match):
|
def dehyphenate(self, match):
|
||||||
firsthalf = match.group('firstpart')
|
firsthalf = match.group('firstpart')
|
||||||
@ -191,31 +197,44 @@ class Dehyphenator(object):
|
|||||||
wraptags = ''
|
wraptags = ''
|
||||||
hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
|
hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
|
||||||
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
||||||
|
if self.suffixes.match(secondhalf) is None:
|
||||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||||
if self.prefixes.match(firsthalf) is None:
|
else:
|
||||||
|
lookupword = dehyphenated
|
||||||
|
if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
|
||||||
lookupword = self.removeprefix.sub('', lookupword)
|
lookupword = self.removeprefix.sub('', lookupword)
|
||||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
|
||||||
try:
|
try:
|
||||||
searchresult = self.html.find(lookupword.lower())
|
searchresult = self.html.find(lookupword.lower())
|
||||||
except:
|
except:
|
||||||
return hyphenated
|
return hyphenated
|
||||||
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
||||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
elif self.html.find(hyphenated) != -1:
|
elif self.html.find(hyphenated) != -1:
|
||||||
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
|
||||||
return hyphenated
|
return hyphenated
|
||||||
else:
|
else:
|
||||||
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
if self.verbose > 2:
|
||||||
|
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
|
||||||
return firsthalf+u'\u2014'+wraptags+secondhalf
|
return firsthalf+u'\u2014'+wraptags+secondhalf
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
|
||||||
|
if self.verbose > 2:
|
||||||
|
self.log("too short, returned hyphenated word: " + str(hyphenated))
|
||||||
|
return hyphenated
|
||||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||||
#print "returned dehyphenated word: " + str(dehyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" returned dehyphenated word: " + str(dehyphenated))
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
else:
|
else:
|
||||||
#print " returned hyphenated word: " + str(hyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" returned hyphenated word: " + str(hyphenated))
|
||||||
return hyphenated
|
return hyphenated
|
||||||
|
|
||||||
def __call__(self, html, format, length=1):
|
def __call__(self, html, format, length=1):
|
||||||
@ -228,7 +247,7 @@ class Dehyphenator(object):
|
|||||||
elif format == 'txt':
|
elif format == 'txt':
|
||||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||||
elif format == 'individual_words':
|
elif format == 'individual_words':
|
||||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
|
||||||
elif format == 'html_cleanup':
|
elif format == 'html_cleanup':
|
||||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||||
elif format == 'txt_cleanup':
|
elif format == 'txt_cleanup':
|
||||||
@ -397,10 +416,8 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
|
||||||
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
|
||||||
]
|
]
|
||||||
def __init__(self, input_plugin_preprocess, plugin_preprocess,
|
def __init__(self, log=None, extra_opts=None):
|
||||||
extra_opts=None):
|
self.log = log
|
||||||
self.input_plugin_preprocess = input_plugin_preprocess
|
|
||||||
self.plugin_preprocess = plugin_preprocess
|
|
||||||
self.extra_opts = extra_opts
|
self.extra_opts = extra_opts
|
||||||
|
|
||||||
def is_baen(self, src):
|
def is_baen(self, src):
|
||||||
@ -436,27 +453,20 @@ class HTMLPreProcessor(object):
|
|||||||
if not getattr(self.extra_opts, 'keep_ligatures', False):
|
if not getattr(self.extra_opts, 'keep_ligatures', False):
|
||||||
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
||||||
|
|
||||||
|
for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
|
||||||
|
search_pattern = getattr(self.extra_opts, search, '')
|
||||||
|
if search_pattern:
|
||||||
|
try:
|
||||||
|
search_re = re.compile(search_pattern)
|
||||||
|
replace_txt = getattr(self.extra_opts, replace, '')
|
||||||
|
if not replace_txt:
|
||||||
|
replace_txt = ''
|
||||||
|
rules.insert(0, (search_re, replace_txt))
|
||||||
|
except Exception as e:
|
||||||
|
self.log.error('Failed to parse %r regexp because %s' %
|
||||||
|
(search, as_unicode(e)))
|
||||||
|
|
||||||
end_rules = []
|
end_rules = []
|
||||||
if getattr(self.extra_opts, 'remove_header', None):
|
|
||||||
try:
|
|
||||||
rules.insert(0,
|
|
||||||
(re.compile(self.extra_opts.header_regex), lambda match : '')
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
import traceback
|
|
||||||
print 'Failed to parse remove_header regexp'
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'remove_footer', None):
|
|
||||||
try:
|
|
||||||
rules.insert(0,
|
|
||||||
(re.compile(self.extra_opts.footer_regex), lambda match : '')
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
import traceback
|
|
||||||
print 'Failed to parse remove_footer regexp'
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
# unwrap/delete soft hyphens
|
# unwrap/delete soft hyphens
|
||||||
@ -464,12 +474,6 @@ class HTMLPreProcessor(object):
|
|||||||
# unwrap/delete soft hyphens with formatting
|
# unwrap/delete soft hyphens with formatting
|
||||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
|
||||||
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
|
||||||
# reduce false positives and move after header/footer removal
|
|
||||||
if getattr(self.extra_opts, 'preprocess_html', None):
|
|
||||||
if is_pdftohtml:
|
|
||||||
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
|
|
||||||
|
|
||||||
length = -1
|
length = -1
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
docanalysis = DocAnalysis('pdf', html)
|
docanalysis = DocAnalysis('pdf', html)
|
||||||
@ -512,15 +516,14 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
if is_pdftohtml and length > -1:
|
if is_pdftohtml and length > -1:
|
||||||
# Dehyphenate
|
# Dehyphenate
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||||
html = dehyphenator(html,'html', length)
|
html = dehyphenator(html,'html', length)
|
||||||
|
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
pdf_markup = PreProcessor(self.extra_opts, None)
|
pdf_markup = HeuristicProcessor(self.extra_opts, None)
|
||||||
totalwords = 0
|
totalwords = 0
|
||||||
totalwords = pdf_markup.get_word_count(html)
|
if pdf_markup.get_word_count(html) > 7000:
|
||||||
if totalwords > 7000:
|
|
||||||
html = pdf_markup.markup_chapters(html, totalwords, True)
|
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||||
|
|
||||||
#dump(html, 'post-preprocess')
|
#dump(html, 'post-preprocess')
|
||||||
@ -540,8 +543,10 @@ class HTMLPreProcessor(object):
|
|||||||
unidecoder = Unidecoder()
|
unidecoder = Unidecoder()
|
||||||
html = unidecoder.decode(html)
|
html = unidecoder.decode(html)
|
||||||
|
|
||||||
if self.plugin_preprocess:
|
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||||
html = self.input_plugin_preprocess(self.extra_opts, html)
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
|
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||||
|
html = preprocessor(html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||||
html = self.smarten_punctuation(html)
|
html = self.smarten_punctuation(html)
|
||||||
|
@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
|||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.wordcount import get_wordcount_obj
|
from calibre.utils.wordcount import get_wordcount_obj
|
||||||
|
|
||||||
class PreProcessor(object):
|
class HeuristicProcessor(object):
|
||||||
|
|
||||||
def __init__(self, extra_opts=None, log=None):
|
def __init__(self, extra_opts=None, log=None):
|
||||||
self.log = default_log if log is None else log
|
self.log = default_log if log is None else log
|
||||||
self.html_preprocess_sections = 0
|
self.html_preprocess_sections = 0
|
||||||
self.found_indents = 0
|
self.found_indents = 0
|
||||||
self.extra_opts = extra_opts
|
self.extra_opts = extra_opts
|
||||||
|
self.deleted_nbsps = False
|
||||||
|
self.totalwords = 0
|
||||||
|
self.min_chapters = 1
|
||||||
|
self.chapters_no_title = 0
|
||||||
|
self.chapters_with_title = 0
|
||||||
|
self.blanks_deleted = False
|
||||||
|
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
|
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
|
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
||||||
|
|
||||||
def is_pdftohtml(self, src):
|
def is_pdftohtml(self, src):
|
||||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
@ -27,12 +36,12 @@ class PreProcessor(object):
|
|||||||
title = match.group('title')
|
title = match.group('title')
|
||||||
if not title:
|
if not title:
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log("marked " + unicode(self.html_preprocess_sections) +
|
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
||||||
" chapters. - " + unicode(chap))
|
" chapters. - " + unicode(chap))
|
||||||
return '<h2>'+chap+'</h2>\n'
|
return '<h2>'+chap+'</h2>\n'
|
||||||
else:
|
else:
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log("marked " + unicode(self.html_preprocess_sections) +
|
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
||||||
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
||||||
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
|
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
|
||||||
|
|
||||||
@ -40,10 +49,18 @@ class PreProcessor(object):
|
|||||||
chap = match.group('section')
|
chap = match.group('section')
|
||||||
styles = match.group('styles')
|
styles = match.group('styles')
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log("marked " + unicode(self.html_preprocess_sections) +
|
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
||||||
" section markers based on punctuation. - " + unicode(chap))
|
" section markers based on punctuation. - " + unicode(chap))
|
||||||
return '<'+styles+' style="page-break-before:always">'+chap
|
return '<'+styles+' style="page-break-before:always">'+chap
|
||||||
|
|
||||||
|
def analyze_title_matches(self, match):
|
||||||
|
#chap = match.group('chap')
|
||||||
|
title = match.group('title')
|
||||||
|
if not title:
|
||||||
|
self.chapters_no_title = self.chapters_no_title + 1
|
||||||
|
else:
|
||||||
|
self.chapters_with_title = self.chapters_with_title + 1
|
||||||
|
|
||||||
def insert_indent(self, match):
|
def insert_indent(self, match):
|
||||||
pstyle = match.group('formatting')
|
pstyle = match.group('formatting')
|
||||||
span = match.group('span')
|
span = match.group('span')
|
||||||
@ -75,8 +92,8 @@ class PreProcessor(object):
|
|||||||
line_end = line_end_ere.findall(raw)
|
line_end = line_end_ere.findall(raw)
|
||||||
tot_htm_ends = len(htm_end)
|
tot_htm_ends = len(htm_end)
|
||||||
tot_ln_fds = len(line_end)
|
tot_ln_fds = len(line_end)
|
||||||
self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
|
#self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
|
||||||
unicode(tot_htm_ends) + " marked up endings")
|
# unicode(tot_htm_ends) + " marked up endings")
|
||||||
|
|
||||||
if percent > 1:
|
if percent > 1:
|
||||||
percent = 1
|
percent = 1
|
||||||
@ -84,9 +101,8 @@ class PreProcessor(object):
|
|||||||
percent = 0
|
percent = 0
|
||||||
|
|
||||||
min_lns = tot_ln_fds * percent
|
min_lns = tot_ln_fds * percent
|
||||||
self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
|
#self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
|
||||||
if min_lns > tot_htm_ends:
|
return min_lns > tot_htm_ends
|
||||||
return True
|
|
||||||
|
|
||||||
def dump(self, raw, where):
|
def dump(self, raw, where):
|
||||||
import os
|
import os
|
||||||
@ -112,16 +128,55 @@ class PreProcessor(object):
|
|||||||
wordcount = get_wordcount_obj(word_count_text)
|
wordcount = get_wordcount_obj(word_count_text)
|
||||||
return wordcount.words
|
return wordcount.words
|
||||||
|
|
||||||
|
def markup_italicis(self, html):
|
||||||
|
ITALICIZE_WORDS = [
|
||||||
|
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
||||||
|
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
||||||
|
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
|
||||||
|
'Mlle.', 'Mons.', 'PS.', 'PPS.',
|
||||||
|
]
|
||||||
|
|
||||||
|
ITALICIZE_STYLE_PATS = [
|
||||||
|
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
|
||||||
|
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
|
||||||
|
]
|
||||||
|
|
||||||
|
for word in ITALICIZE_WORDS:
|
||||||
|
html = html.replace(word, '<i>%s</i>' % word)
|
||||||
|
|
||||||
|
for pat in ITALICIZE_STYLE_PATS:
|
||||||
|
html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
|
||||||
|
|
||||||
|
return html
|
||||||
|
|
||||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
||||||
|
'''
|
||||||
|
Searches for common chapter headings throughout the document
|
||||||
|
attempts multiple patterns based on likelihood of a match
|
||||||
|
with minimum false positives. Exits after finding a successful pattern
|
||||||
|
'''
|
||||||
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
|
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
|
||||||
# minimum of chapters to search for
|
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
|
||||||
self.min_chapters = 1
|
# or pdf page numbers from being treated as TOC markers
|
||||||
|
max_chapters = 150
|
||||||
|
typical_chapters = 7000.
|
||||||
if wordcount > 7000:
|
if wordcount > 7000:
|
||||||
self.min_chapters = int(ceil(wordcount / 7000.))
|
if wordcount > 200000:
|
||||||
#print "minimum chapters required are: "+str(self.min_chapters)
|
typical_chapters = 15000.
|
||||||
|
self.min_chapters = int(ceil(wordcount / typical_chapters))
|
||||||
|
self.log.debug("minimum chapters required are: "+str(self.min_chapters))
|
||||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||||
self.html_preprocess_sections = len(heading.findall(html))
|
self.html_preprocess_sections = len(heading.findall(html))
|
||||||
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||||
|
|
||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
init_lookahead = "(?=<(p|div))"
|
init_lookahead = "(?=<(p|div))"
|
||||||
@ -151,103 +206,160 @@ class PreProcessor(object):
|
|||||||
n_lookahead_open = "\s+(?!"
|
n_lookahead_open = "\s+(?!"
|
||||||
n_lookahead_close = ")"
|
n_lookahead_close = ")"
|
||||||
|
|
||||||
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
|
||||||
|
|
||||||
|
analysis_result = []
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
|
||||||
[r"([A-Z-]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
|
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
|
||||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
|
||||||
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
|
||||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
|
||||||
[r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
|
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
|
||||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
|
||||||
|
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def recurse_patterns(html, analyze):
|
||||||
# Start with most typical chapter headings, get more aggressive until one works
|
# Start with most typical chapter headings, get more aggressive until one works
|
||||||
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
|
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
|
||||||
|
n_lookahead = ''
|
||||||
|
hits = 0
|
||||||
|
self.chapters_no_title = 0
|
||||||
|
self.chapters_with_title = 0
|
||||||
|
|
||||||
|
if n_lookahead_req:
|
||||||
|
lp_n_lookahead_open = n_lookahead_open
|
||||||
|
lp_n_lookahead_close = n_lookahead_close
|
||||||
|
else:
|
||||||
|
lp_n_lookahead_open = ''
|
||||||
|
lp_n_lookahead_close = ''
|
||||||
|
|
||||||
|
if strict_title:
|
||||||
|
lp_title = default_title
|
||||||
|
else:
|
||||||
|
lp_title = simple_title
|
||||||
|
|
||||||
|
if ignorecase:
|
||||||
|
arg_ignorecase = r'(?i)'
|
||||||
|
else:
|
||||||
|
arg_ignorecase = ''
|
||||||
|
|
||||||
|
if title_req:
|
||||||
|
lp_opt_title_open = ''
|
||||||
|
lp_opt_title_close = ''
|
||||||
|
else:
|
||||||
|
lp_opt_title_open = opt_title_open
|
||||||
|
lp_opt_title_close = opt_title_close
|
||||||
|
|
||||||
if self.html_preprocess_sections >= self.min_chapters:
|
if self.html_preprocess_sections >= self.min_chapters:
|
||||||
break
|
break
|
||||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||||
|
if n_lookahead_req:
|
||||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||||
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
if not analyze:
|
||||||
if lookahead_ignorecase:
|
self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
|
||||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
|
||||||
|
chapdetect = re.compile(r'%s' % chapter_marker)
|
||||||
|
|
||||||
|
if analyze:
|
||||||
|
hits = len(chapdetect.findall(html))
|
||||||
|
if hits:
|
||||||
|
chapdetect.sub(self.analyze_title_matches, html)
|
||||||
|
if float(self.chapters_with_title) / float(hits) > .5:
|
||||||
|
title_req = True
|
||||||
|
strict_title = False
|
||||||
|
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||||
|
if type_name == 'common':
|
||||||
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
|
elif self.min_chapters <= hits < max_chapters:
|
||||||
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
|
||||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
|
||||||
html = chapdetect.sub(self.chapter_head, html)
|
html = chapdetect.sub(self.chapter_head, html)
|
||||||
|
return html
|
||||||
|
|
||||||
|
recurse_patterns(html, True)
|
||||||
|
chapter_types = analysis_result
|
||||||
|
html = recurse_patterns(html, False)
|
||||||
|
|
||||||
words_per_chptr = wordcount
|
words_per_chptr = wordcount
|
||||||
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
|
||||||
words_per_chptr = wordcount / self.html_preprocess_sections
|
words_per_chptr = wordcount / self.html_preprocess_sections
|
||||||
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def punctuation_unwrap(self, length, content, format):
|
def punctuation_unwrap(self, length, content, format):
|
||||||
|
'''
|
||||||
|
Unwraps lines based on line length and punctuation
|
||||||
|
supports a range of html markup and text files
|
||||||
|
'''
|
||||||
# define the pieces of the regex
|
# define the pieces of the regex
|
||||||
lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||||
line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
|
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
||||||
|
soft_hyphen = u"\xad"
|
||||||
|
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
||||||
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
||||||
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
|
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
|
||||||
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
|
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
|
||||||
|
|
||||||
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
||||||
|
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
|
||||||
|
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
|
||||||
|
|
||||||
if format == 'txt':
|
if format == 'txt':
|
||||||
unwrap_regex = lookahead+txt_line_wrap
|
unwrap_regex = lookahead+txt_line_wrap
|
||||||
|
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
|
||||||
|
shy_unwrap_regex = soft_hyphen+txt_line_wrap
|
||||||
|
|
||||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||||
|
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||||
|
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||||
|
|
||||||
content = unwrap.sub(' ', content)
|
content = unwrap.sub(' ', content)
|
||||||
|
content = em_en_unwrap.sub('', content)
|
||||||
|
content = shy_unwrap.sub('', content)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
def txt_process(self, match):
|
||||||
def __call__(self, html):
|
|
||||||
self.log("********* Preprocessing HTML *********")
|
|
||||||
|
|
||||||
# Count the words in the document to estimate how many chapters to look for and whether
|
|
||||||
# other types of processing are attempted
|
|
||||||
totalwords = 0
|
|
||||||
totalwords = self.get_word_count(html)
|
|
||||||
|
|
||||||
if totalwords < 50:
|
|
||||||
self.log("not enough text, not preprocessing")
|
|
||||||
return html
|
|
||||||
|
|
||||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
|
||||||
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
|
|
||||||
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
|
|
||||||
|
|
||||||
###### Check Markup ######
|
|
||||||
#
|
|
||||||
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
|
||||||
# <pre> tags), check and mark up line endings if required before proceeding
|
|
||||||
if self.no_markup(html, 0.1):
|
|
||||||
self.log("not enough paragraph markers, adding now")
|
|
||||||
# check if content is in pre tags, use txt processor to mark up if so
|
|
||||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
|
||||||
if len(pre.findall(html)) == 1:
|
|
||||||
self.log("Running Text Processing")
|
|
||||||
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||||
separate_paragraphs_single_line
|
separate_paragraphs_single_line
|
||||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
content = match.group('text')
|
||||||
html = outerhtml.sub('\g<text>', html)
|
content = separate_paragraphs_single_line(content)
|
||||||
html = separate_paragraphs_single_line(html)
|
content = preserve_spaces(content)
|
||||||
html = preserve_spaces(html)
|
content = convert_basic(content, epub_split_size_kb=0)
|
||||||
html = convert_basic(html, epub_split_size_kb=0)
|
return content
|
||||||
|
|
||||||
|
def markup_pre(self, html):
|
||||||
|
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||||
|
if len(pre.findall(html)) >= 1:
|
||||||
|
self.log.debug("Running Text Processing")
|
||||||
|
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
|
||||||
|
html = outerhtml.sub(self.txt_process, html)
|
||||||
else:
|
else:
|
||||||
# Add markup naively
|
# Add markup naively
|
||||||
# TODO - find out if there are cases where there are more than one <pre> tag or
|
# TODO - find out if there are cases where there are more than one <pre> tag or
|
||||||
# other types of unmarked html and handle them in some better fashion
|
# other types of unmarked html and handle them in some better fashion
|
||||||
add_markup = re.compile('(?<!>)(\n)')
|
add_markup = re.compile('(?<!>)(\n)')
|
||||||
html = add_markup.sub('</p>\n<p>', html)
|
html = add_markup.sub('</p>\n<p>', html)
|
||||||
|
return html
|
||||||
|
|
||||||
###### Mark Indents/Cleanup ######
|
def arrange_htm_line_endings(self, html):
|
||||||
#
|
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
|
||||||
# Replace series of non-breaking spaces with text-indent
|
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
|
||||||
|
return html
|
||||||
|
|
||||||
|
def fix_nbsp_indents(self, html):
|
||||||
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
|
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
|
||||||
html = txtindent.sub(self.insert_indent, html)
|
html = txtindent.sub(self.insert_indent, html)
|
||||||
if self.found_indents > 1:
|
if self.found_indents > 1:
|
||||||
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
|
self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
|
||||||
|
return html
|
||||||
|
|
||||||
|
def cleanup_markup(self, html):
|
||||||
# remove remaining non-breaking spaces
|
# remove remaining non-breaking spaces
|
||||||
html = re.sub(ur'\u00a0', ' ', html)
|
html = re.sub(ur'\u00a0', ' ', html)
|
||||||
# Get rid of various common microsoft specific tags which can cause issues later
|
# Get rid of various common microsoft specific tags which can cause issues later
|
||||||
@ -255,108 +367,166 @@ class PreProcessor(object):
|
|||||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||||
# Delete microsoft 'smart' tags
|
# Delete microsoft 'smart' tags
|
||||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||||
# Get rid of empty span, bold, & italics tags
|
# Get rid of empty span, bold, font, em, & italics tags
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
# ADE doesn't render <br />, change to empty paragraphs
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
self.deleted_nbsps = True
|
||||||
|
return html
|
||||||
|
|
||||||
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
|
def analyze_line_endings(self, html):
|
||||||
# paragraph spacing then delete blank lines to clean up spacing
|
'''
|
||||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
determines the type of html line ending used most commonly in a document
|
||||||
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
use before calling docanalysis functions
|
||||||
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
|
'''
|
||||||
blanklines = blankreg.findall(html)
|
|
||||||
lines = linereg.findall(html)
|
|
||||||
blanks_between_paragraphs = False
|
|
||||||
if len(lines) > 1:
|
|
||||||
self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
|
|
||||||
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
|
||||||
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
|
|
||||||
'remove_paragraph_spacing', False):
|
|
||||||
self.log("deleting blank lines")
|
|
||||||
html = blankreg.sub('', html)
|
|
||||||
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
|
||||||
blanks_between_paragraphs = True
|
|
||||||
#print "blanks between paragraphs is marked True"
|
|
||||||
else:
|
|
||||||
blanks_between_paragraphs = False
|
|
||||||
|
|
||||||
#self.dump(html, 'before_chapter_markup')
|
|
||||||
# detect chapters/sections to match xpath or splitting logic
|
|
||||||
#
|
|
||||||
|
|
||||||
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
|
||||||
|
|
||||||
|
|
||||||
###### Unwrap lines ######
|
|
||||||
#
|
|
||||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
|
||||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
|
||||||
# that lines can be un-wrapped across page boundaries
|
|
||||||
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
|
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
|
||||||
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
|
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
|
||||||
paras = len(paras_reg.findall(html))
|
paras = len(paras_reg.findall(html))
|
||||||
spans = len(spans_reg.findall(html))
|
spans = len(spans_reg.findall(html))
|
||||||
if spans > 1:
|
if spans > 1:
|
||||||
if float(paras) / float(spans) < 0.75:
|
if float(paras) / float(spans) < 0.75:
|
||||||
format = 'spanned_html'
|
return 'spanned_html'
|
||||||
else:
|
else:
|
||||||
format = 'html'
|
return 'html'
|
||||||
else:
|
else:
|
||||||
format = 'html'
|
return 'html'
|
||||||
|
|
||||||
|
def analyze_blanks(self, html):
|
||||||
|
blanklines = self.blankreg.findall(html)
|
||||||
|
lines = self.linereg.findall(html)
|
||||||
|
if len(lines) > 1:
|
||||||
|
self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
|
||||||
|
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||||
|
|
||||||
|
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def cleanup_required(self):
|
||||||
|
for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
|
||||||
|
if getattr(self.extra_opts, option, False):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def __call__(self, html):
|
||||||
|
self.log.debug("********* Heuristic processing HTML *********")
|
||||||
|
|
||||||
|
# Count the words in the document to estimate how many chapters to look for and whether
|
||||||
|
# other types of processing are attempted
|
||||||
|
try:
|
||||||
|
self.totalwords = self.get_word_count(html)
|
||||||
|
except:
|
||||||
|
self.log.warn("Can't get wordcount")
|
||||||
|
|
||||||
|
if self.totalwords < 50:
|
||||||
|
self.log.warn("flow is too short, not running heuristics")
|
||||||
|
return html
|
||||||
|
|
||||||
|
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||||
|
html = self.arrange_htm_line_endings(html)
|
||||||
|
|
||||||
|
if self.cleanup_required():
|
||||||
|
###### Check Markup ######
|
||||||
|
#
|
||||||
|
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
||||||
|
# <pre> tags), check and mark up line endings if required before proceeding
|
||||||
|
# fix indents must run after this step
|
||||||
|
if self.no_markup(html, 0.1):
|
||||||
|
self.log.debug("not enough paragraph markers, adding now")
|
||||||
|
# markup using text processing
|
||||||
|
html = self.markup_pre(html)
|
||||||
|
|
||||||
|
# Replace series of non-breaking spaces with text-indent
|
||||||
|
if getattr(self.extra_opts, 'fix_indents', False):
|
||||||
|
html = self.fix_nbsp_indents(html)
|
||||||
|
|
||||||
|
if self.cleanup_required():
|
||||||
|
# fix indents must run before this step, as it removes non-breaking spaces
|
||||||
|
html = self.cleanup_markup(html)
|
||||||
|
|
||||||
|
# ADE doesn't render <br />, change to empty paragraphs
|
||||||
|
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||||
|
|
||||||
|
# Determine whether the document uses interleaved blank lines
|
||||||
|
blanks_between_paragraphs = self.analyze_blanks(html)
|
||||||
|
|
||||||
|
#self.dump(html, 'before_chapter_markup')
|
||||||
|
# detect chapters/sections to match xpath or splitting logic
|
||||||
|
|
||||||
|
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
|
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
||||||
|
|
||||||
|
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||||
|
html = self.markup_italicis(html)
|
||||||
|
|
||||||
|
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
||||||
|
# blank paragraphs then delete blank lines to clean up spacing
|
||||||
|
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||||
|
self.log.debug("deleting blank lines")
|
||||||
|
self.blanks_deleted = True
|
||||||
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
|
# Determine line ending type
|
||||||
|
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||||
|
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
||||||
|
# that lines can be un-wrapped across page boundaries
|
||||||
|
format = self.analyze_line_endings(html)
|
||||||
|
|
||||||
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
|
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
|
||||||
# more of the lines break in the same region of the document then unwrapping is required
|
# more of the lines break in the same region of the document then unwrapping is required
|
||||||
docanalysis = DocAnalysis(format, html)
|
docanalysis = DocAnalysis(format, html)
|
||||||
hardbreaks = docanalysis.line_histogram(.50)
|
hardbreaks = docanalysis.line_histogram(.50)
|
||||||
self.log("Hard line breaks check returned "+unicode(hardbreaks))
|
self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))
|
||||||
|
|
||||||
# Calculate Length
|
# Calculate Length
|
||||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||||
length = docanalysis.line_length(unwrap_factor)
|
length = docanalysis.line_length(unwrap_factor)
|
||||||
self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
||||||
|
|
||||||
|
###### Unwrap lines ######
|
||||||
|
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||||
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
||||||
if hardbreaks or unwrap_factor < 0.4:
|
if hardbreaks or unwrap_factor < 0.4:
|
||||||
self.log("Unwrapping required, unwrapping Lines")
|
self.log.debug("Unwrapping required, unwrapping Lines")
|
||||||
# Unwrap em/en dashes
|
# Dehyphenate with line length limiters
|
||||||
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
|
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||||
# Dehyphenate
|
|
||||||
self.log("Unwrapping/Removing hyphens")
|
|
||||||
dehyphenator = Dehyphenator()
|
|
||||||
html = dehyphenator(html,'html', length)
|
html = dehyphenator(html,'html', length)
|
||||||
self.log("Done dehyphenating")
|
|
||||||
# Unwrap lines using punctation and line length
|
|
||||||
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
|
||||||
html = self.punctuation_unwrap(length, html, 'html')
|
html = self.punctuation_unwrap(length, html, 'html')
|
||||||
#check any remaining hyphens, but only unwrap if there is a match
|
|
||||||
dehyphenator = Dehyphenator()
|
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
|
||||||
else:
|
|
||||||
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
|
||||||
self.log("Cleaning up hyphenation")
|
|
||||||
dehyphenator = Dehyphenator()
|
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
|
||||||
self.log("Done dehyphenating")
|
|
||||||
|
|
||||||
# delete soft hyphens
|
if getattr(self.extra_opts, 'dehyphenate', False):
|
||||||
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
||||||
|
self.log.debug("Fixing hyphenated content")
|
||||||
|
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||||
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
html = dehyphenator(html, 'individual_words', length)
|
||||||
|
|
||||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||||
if self.html_preprocess_sections < self.min_chapters:
|
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
self.log("Looking for more split points based on punctuation,"
|
self.log.debug("Looking for more split points based on punctuation,"
|
||||||
" currently have " + unicode(self.html_preprocess_sections))
|
" currently have " + unicode(self.html_preprocess_sections))
|
||||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||||
html = chapdetect3.sub(self.chapter_break, html)
|
html = chapdetect3.sub(self.chapter_break, html)
|
||||||
|
|
||||||
|
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||||
# search for places where a first or second level heading is immediately followed by another
|
# search for places where a first or second level heading is immediately followed by another
|
||||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||||
# headings and titles, images, etc
|
# headings and titles, images, etc
|
||||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
|
||||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||||
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
|
||||||
|
|
||||||
# Center separator lines
|
# Center separator lines
|
||||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
|
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
||||||
|
if not self.blanks_deleted:
|
||||||
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
|
html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
|
||||||
|
|
||||||
|
if self.deleted_nbsps:
|
||||||
|
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||||
|
html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
@ -21,10 +21,9 @@ from calibre.customize.conversion import InputFormatPlugin
|
|||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
from calibre.customize.conversion import OptionRecommendation
|
||||||
from calibre.constants import islinux, isfreebsd, iswindows
|
from calibre.constants import islinux, isfreebsd, iswindows
|
||||||
from calibre import unicode_path
|
from calibre import unicode_path, as_unicode
|
||||||
from calibre.utils.localization import get_lang
|
from calibre.utils.localization import get_lang
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
class Link(object):
|
class Link(object):
|
||||||
'''
|
'''
|
||||||
@ -112,7 +111,7 @@ class HTMLFile(object):
|
|||||||
with open(self.path, 'rb') as f:
|
with open(self.path, 'rb') as f:
|
||||||
src = f.read()
|
src = f.read()
|
||||||
except IOError, err:
|
except IOError, err:
|
||||||
msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
|
msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
|
||||||
if level == 0:
|
if level == 0:
|
||||||
raise IOError(msg)
|
raise IOError(msg)
|
||||||
raise IgnoreFile(msg, err.errno)
|
raise IgnoreFile(msg, err.errno)
|
||||||
@ -296,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
return oeb
|
return oeb
|
||||||
|
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
return create_oebbook(log, stream.name, opts, self,
|
return create_oebbook(log, stream.name, opts,
|
||||||
encoding=opts.input_encoding)
|
encoding=opts.input_encoding)
|
||||||
|
|
||||||
def is_case_sensitive(self, path):
|
def is_case_sensitive(self, path):
|
||||||
@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
self.log.exception('Failed to read CSS file: %r'%link)
|
self.log.exception('Failed to read CSS file: %r'%link)
|
||||||
return (None, None)
|
return (None, None)
|
||||||
return (None, raw)
|
return (None, raw)
|
||||||
|
|
||||||
def preprocess_html(self, options, html):
|
|
||||||
self.options = options
|
|
||||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
|
||||||
return preprocessor(html)
|
|
||||||
|
|
||||||
|
@ -7,8 +7,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
|
|
||||||
class LITInput(InputFormatPlugin):
|
class LITInput(InputFormatPlugin):
|
||||||
|
|
||||||
@ -22,7 +20,7 @@ class LITInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.lit.reader import LitReader
|
from calibre.ebooks.lit.reader import LitReader
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
self.log = log
|
self.log = log
|
||||||
return create_oebbook(log, stream, options, self, reader=LitReader)
|
return create_oebbook(log, stream, options, reader=LitReader)
|
||||||
|
|
||||||
def postprocess_book(self, oeb, opts, log):
|
def postprocess_book(self, oeb, opts, log):
|
||||||
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
|
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
|
||||||
@ -39,10 +37,13 @@ class LITInput(InputFormatPlugin):
|
|||||||
body = body[0]
|
body = body[0]
|
||||||
if len(body) == 1 and body[0].tag == XHTML('pre'):
|
if len(body) == 1 and body[0].tag == XHTML('pre'):
|
||||||
pre = body[0]
|
pre = body[0]
|
||||||
from calibre.ebooks.txt.processor import convert_basic
|
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||||
|
separate_paragraphs_single_line
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import copy
|
import copy
|
||||||
html = convert_basic(pre.text).replace('<html>',
|
html = separate_paragraphs_single_line(pre.text)
|
||||||
|
html = preserve_spaces(html)
|
||||||
|
html = convert_basic(html).replace('<html>',
|
||||||
'<html xmlns="%s">'%XHTML_NS)
|
'<html xmlns="%s">'%XHTML_NS)
|
||||||
root = etree.fromstring(html)
|
root = etree.fromstring(html)
|
||||||
body = XPath('//h:body')(root)
|
body = XPath('//h:body')(root)
|
||||||
@ -51,10 +52,3 @@ class LITInput(InputFormatPlugin):
|
|||||||
for elem in body:
|
for elem in body:
|
||||||
ne = copy.deepcopy(elem)
|
ne = copy.deepcopy(elem)
|
||||||
pre.append(ne)
|
pre.append(ne)
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, options, html):
    """Remember *options* on the plugin and return *html* after PreProcessor.

    The plugin's ``log`` attribute is forwarded to the PreProcessor when the
    plugin has one; otherwise ``None`` is passed as the log.
    """
    self.options = options
    logger = getattr(self, 'log', None)
    run_preprocessor = PreProcessor(self.options, log=logger)
    return run_preprocessor(html)
|
|
||||||
|
|
||||||
|
@ -12,7 +12,6 @@ from copy import deepcopy
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
from calibre import guess_type
|
from calibre import guess_type
|
||||||
|
|
||||||
class Canvas(etree.XSLTExtension):
|
class Canvas(etree.XSLTExtension):
|
||||||
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
|
|||||||
f.write(result)
|
f.write(result)
|
||||||
styles.write()
|
styles.write()
|
||||||
return os.path.abspath('content.opf')
|
return os.path.abspath('content.opf')
|
||||||
|
|
||||||
def preprocess_html(self, options, html):
    """Remember *options* on the plugin and return *html* after PreProcessor.

    The plugin's ``log`` attribute is forwarded to the PreProcessor when the
    plugin has one; otherwise ``None`` is passed as the log.
    """
    self.options = options
    logger = getattr(self, 'log', None)
    run_preprocessor = PreProcessor(self.options, log=logger)
    return run_preprocessor(html)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,7 +3,6 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
|
||||||
class MOBIInput(InputFormatPlugin):
|
class MOBIInput(InputFormatPlugin):
|
||||||
@ -39,11 +38,3 @@ class MOBIInput(InputFormatPlugin):
|
|||||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||||
return mr.created_opf_path
|
return mr.created_opf_path
|
||||||
|
|
||||||
def preprocess_html(self, options, html):
    """Demote a heading that directly follows an <h1>/<h2> heading to <h3>.

    Searches for places where a first or second level heading is followed
    (with only whitespace or non-heading tags in between) by another
    <h1>/<h2>. The second heading is rewritten as <h3>, preceded by a
    newline, to prevent splitting between chapter headings and titles,
    images, etc.

    :param options: conversion options (not used by this method).
    :param html: the markup to process.
    :return: the markup with the second heading of each pair demoted.
    """
    # Raw strings keep the regex and the \g<name> group references free of
    # invalid string escape sequences.
    doubleheading = re.compile(
        r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)'
        r'<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>',
        re.IGNORECASE)
    replacement = r'\g<firsthead>' + '\n' + r'<h3\g<secondhead></h3>'
    return doubleheading.sub(replacement, html)
|
|
||||||
|
|
||||||
|
@ -199,8 +199,8 @@ class EbookIterator(object):
|
|||||||
not hasattr(self.pathtoopf, 'manifest'):
|
not hasattr(self.pathtoopf, 'manifest'):
|
||||||
if hasattr(self.pathtoopf, 'manifest'):
|
if hasattr(self.pathtoopf, 'manifest'):
|
||||||
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
|
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
|
||||||
self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
|
self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
|
||||||
plumber.input_plugin)
|
plumber.opts)
|
||||||
|
|
||||||
if hasattr(self.pathtoopf, 'manifest'):
|
if hasattr(self.pathtoopf, 'manifest'):
|
||||||
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
|
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
|
||||||
|
@ -9,7 +9,6 @@ import os
|
|||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
class PDBInput(InputFormatPlugin):
|
class PDBInput(InputFormatPlugin):
|
||||||
|
|
||||||
@ -32,8 +31,3 @@ class PDBInput(InputFormatPlugin):
|
|||||||
opf = reader.extract_content(os.getcwd())
|
opf = reader.extract_content(os.getcwd())
|
||||||
|
|
||||||
return opf
|
return opf
|
||||||
|
|
||||||
def preprocess_html(self, options, html):
    """Remember *options* on the plugin and return *html* after PreProcessor.

    The plugin's ``log`` attribute is forwarded to the PreProcessor when the
    plugin has one; otherwise ``None`` is passed as the log.
    """
    self.options = options
    logger = getattr(self, 'log', None)
    run_preprocessor = PreProcessor(self.options, log=logger)
    return run_preprocessor(html)
|
|
||||||
|
@ -7,7 +7,6 @@ import os, glob, re, textwrap
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
|
||||||
|
|
||||||
border_style_map = {
|
border_style_map = {
|
||||||
'single' : 'solid',
|
'single' : 'solid',
|
||||||
@ -319,13 +318,9 @@ class RTFInput(InputFormatPlugin):
|
|||||||
res = transform.tostring(result)
|
res = transform.tostring(result)
|
||||||
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||||
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
||||||
if not getattr(self.opts, 'remove_paragraph_spacing', False):
|
|
||||||
res = re.sub('\s*<body>', '<body>', res)
|
res = re.sub('\s*<body>', '<body>', res)
|
||||||
res = re.sub('(?<=\n)\n{2}',
|
res = re.sub('(?<=\n)\n{2}',
|
||||||
u'<p>\u00a0</p>\n'.encode('utf-8'), res)
|
u'<p>\u00a0</p>\n'.encode('utf-8'), res)
|
||||||
if self.opts.preprocess_html:
|
|
||||||
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
|
|
||||||
res = preprocessor(res.decode('utf-8')).encode('utf-8')
|
|
||||||
f.write(res)
|
f.write(res)
|
||||||
self.write_inline_css(inline_class, border_styles)
|
self.write_inline_css(inline_class, border_styles)
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
@ -41,7 +41,7 @@ class SNBInput(InputFormatPlugin):
|
|||||||
raise ValueError("Invalid SNB file")
|
raise ValueError("Invalid SNB file")
|
||||||
log.debug("Handle meta data ...")
|
log.debug("Handle meta data ...")
|
||||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||||
oeb = create_oebbook(log, None, options, self,
|
oeb = create_oebbook(log, None, options,
|
||||||
encoding=options.input_encoding, populate=False)
|
encoding=options.input_encoding, populate=False)
|
||||||
meta = snbFile.GetFileStream('snbf/book.snbf')
|
meta = snbFile.GetFileStream('snbf/book.snbf')
|
||||||
if meta != None:
|
if meta != None:
|
||||||
|
@ -1,58 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml
|
|
||||||
|
|
||||||
class TXTHeuristicProcessor(object):
    """Turn plain text into HTML, italicizing common words and markup.

    ``ITALICIZE_WORDS`` lists abbreviations and phrases that are wrapped in
    ``<i>`` tags; ``ITALICIZE_STYLE_PATS`` lists regular expressions for
    plain-text emphasis conventions (``_word_``, ``*word*``, ``/word/`` ...)
    whose ``words`` group is wrapped in ``<i>`` tags.
    """

    def __init__(self):
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)\|:(?P<words>.+?):\|',
        ]

    def _italicize_words_pattern(self):
        """Return a regex matching any ITALICIZE_WORDS entry as a whole word.

        Returns ``None`` when the list is empty, because an empty alternation
        would match the empty string everywhere.
        """
        if not self.ITALICIZE_WORDS:
            return None
        # Longest entries first so that e.g. 'PPS.' wins over 'PS.'.
        ordered = sorted(self.ITALICIZE_WORDS, key=len, reverse=True)
        alternation = '|'.join(re.escape(word) for word in ordered)
        # The look-arounds stop entries from matching inside longer words
        # ('eg.' inside 'leg.', 'ie.' inside 'die.').
        return re.compile(r'(?<!\w)(?:%s)(?!\w)' % alternation, re.UNICODE)

    def process_paragraph(self, paragraph):
        """Return *paragraph* with known words and emphasis markup italicized.

        Every whole-word occurrence of an ``ITALICIZE_WORDS`` entry is wrapped
        in ``<i>...</i>`` in a single pass, then each pattern in
        ``ITALICIZE_STYLE_PATS`` is applied in order, replacing the match
        with its ``words`` group wrapped in ``<i>...</i>``.
        """
        word_pattern = self._italicize_words_pattern()
        if word_pattern is not None:
            paragraph = word_pattern.sub(
                lambda mo: '<i>%s</i>' % mo.group(0), paragraph)
        for pat in self.ITALICIZE_STYLE_PATS:
            # A callable replacement keeps backslashes in the matched text
            # from being interpreted as template escapes.
            paragraph = re.sub(
                pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        """Convert *txt* to an HTML document string.

        The text is passed through ``clean_txt`` and ``split_txt``, split
        into paragraphs on blank lines, and each paragraph is joined onto one
        line, XML-escaped, italicized and wrapped in ``<p>`` tags. Runs of
        two or more spaces are collapsed, the result is inserted into
        ``HTML_TEMPLATE`` together with *title*, and chapter headings are
        marked up with ``PreProcessor.markup_chapters``.

        :param txt: the plain text to convert.
        :param title: title inserted into the HTML template.
        :param epub_split_size_kb: forwarded to ``split_txt``.
        :return: the generated HTML.
        """
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        for line in txt.split('\n\n'):
            escaped = prepare_string_for_xml(line.replace('\n', ' '))
            processed.append(u'<p>%s</p>' % self.process_paragraph(escaped))

        txt = u'\n'.join(processed)
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html
|
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
convert_heuristic, normalize_line_endings, convert_textile
|
normalize_line_endings, convert_textile
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -53,6 +53,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
|
self.log = log
|
||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
|
|
||||||
txt = stream.read()
|
txt = stream.read()
|
||||||
@ -106,7 +107,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||||
|
|
||||||
# Dehyphenate
|
# Dehyphenate
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||||
txt = dehyphenator(txt,'txt', length)
|
txt = dehyphenator(txt,'txt', length)
|
||||||
|
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
@ -118,24 +119,24 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
|
|
||||||
if options.paragraph_type == 'unformatted':
|
if options.paragraph_type == 'unformatted':
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
# get length
|
# get length
|
||||||
|
|
||||||
# unwrap lines based on punctuation
|
# unwrap lines based on punctuation
|
||||||
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
|
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
|
|
||||||
if options.formatting_type == 'heuristic':
|
|
||||||
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
|
||||||
else:
|
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
if options.formatting_type == 'heuristic':
|
||||||
dehyphenator = Dehyphenator()
|
setattr(options, 'enable_heuristics', True)
|
||||||
html = dehyphenator(html,'txt_cleanup', length)
|
setattr(options, 'markup_chapter_headings', True)
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
setattr(options, 'italicize_common_cases', True)
|
||||||
|
setattr(options, 'fix_indents', True)
|
||||||
|
setattr(options, 'delete_blank_paragraphs', True)
|
||||||
|
setattr(options, 'format_scene_breaks', True)
|
||||||
|
setattr(options, 'dehyphenate', True)
|
||||||
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
html_input = plugin_for_input_format('html')
|
html_input = plugin_for_input_format('html')
|
||||||
|
@ -51,12 +51,12 @@ class TXTOutput(OutputFormatPlugin):
|
|||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Do not remove links within the document. This is only ' \
|
help=_('Do not remove links within the document. This is only ' \
|
||||||
'useful when paired with the markdown-format option because' \
|
'useful when paired with the markdown-format option because' \
|
||||||
'links are always removed with plain text output.')),
|
' links are always removed with plain text output.')),
|
||||||
OptionRecommendation(name='keep_image_references',
|
OptionRecommendation(name='keep_image_references',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
help=_('Do not remove image references within the document. This is only ' \
|
help=_('Do not remove image references within the document. This is only ' \
|
||||||
'useful when paired with the markdown-format option because' \
|
'useful when paired with the markdown-format option because' \
|
||||||
'image references are always removed with plain text output.')),
|
' image references are always removed with plain text output.')),
|
||||||
])
|
])
|
||||||
|
|
||||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||||
|
@ -12,7 +12,6 @@ import os, re
|
|||||||
|
|
||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
|
||||||
@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
|
|
||||||
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
||||||
|
|
||||||
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    """Convert *txt* to HTML with a fresh TXTHeuristicProcessor.

    *title* and *epub_split_size_kb* are passed straight through to
    ``TXTHeuristicProcessor.convert``, whose result is returned.
    """
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
|
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
|
def convert_markdown(txt, title='', disable_toc=False):
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
|
@ -11,6 +11,8 @@ from calibre.gui2.convert.single import Config, sort_formats_by_preference, \
|
|||||||
from calibre.customize.ui import available_output_formats
|
from calibre.customize.ui import available_output_formats
|
||||||
from calibre.gui2 import ResizableDialog
|
from calibre.gui2 import ResizableDialog
|
||||||
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
|
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
|
||||||
|
from calibre.gui2.convert.heuristics import HeuristicsWidget
|
||||||
|
from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
|
||||||
from calibre.gui2.convert.page_setup import PageSetupWidget
|
from calibre.gui2.convert.page_setup import PageSetupWidget
|
||||||
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
|
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
|
||||||
from calibre.gui2.convert.toc import TOCWidget
|
from calibre.gui2.convert.toc import TOCWidget
|
||||||
@ -69,6 +71,8 @@ class BulkConfig(Config):
|
|||||||
|
|
||||||
self.setWindowTitle(_('Bulk Convert'))
|
self.setWindowTitle(_('Bulk Convert'))
|
||||||
lf = widget_factory(LookAndFeelWidget)
|
lf = widget_factory(LookAndFeelWidget)
|
||||||
|
hw = widget_factory(HeuristicsWidget)
|
||||||
|
sr = widget_factory(SearchAndReplaceWidget)
|
||||||
ps = widget_factory(PageSetupWidget)
|
ps = widget_factory(PageSetupWidget)
|
||||||
sd = widget_factory(StructureDetectionWidget)
|
sd = widget_factory(StructureDetectionWidget)
|
||||||
toc = widget_factory(TOCWidget)
|
toc = widget_factory(TOCWidget)
|
||||||
@ -90,7 +94,7 @@ class BulkConfig(Config):
|
|||||||
if not c: break
|
if not c: break
|
||||||
self.stack.removeWidget(c)
|
self.stack.removeWidget(c)
|
||||||
|
|
||||||
widgets = [lf, ps, sd, toc]
|
widgets = [lf, hw, ps, sd, toc, sr]
|
||||||
if output_widget is not None:
|
if output_widget is not None:
|
||||||
widgets.append(output_widget)
|
widgets.append(output_widget)
|
||||||
for w in widgets:
|
for w in widgets:
|
||||||
|
58
src/calibre/gui2/convert/heuristics.py
Normal file
58
src/calibre/gui2/convert/heuristics.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
from PyQt4.Qt import Qt
|
||||||
|
|
||||||
|
from calibre.gui2.convert.heuristics_ui import Ui_Form
|
||||||
|
from calibre.gui2.convert import Widget
|
||||||
|
|
||||||
|
class HeuristicsWidget(Widget, Ui_Form):
    """Conversion configuration widget for the heuristic processing options."""

    TITLE = _('Heuristic\nProcessing')
    HELP = _('Modify the document text and structure using common patterns.')
    COMMIT_NAME = 'heuristics'
    ICON = I('heuristics.png')

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        """Register the heuristic options and wire up the enabling checkboxes."""
        Widget.__init__(self, parent,
                ['enable_heuristics', 'markup_chapter_headings',
                 'italicize_common_cases', 'fix_indents',
                 'html_unwrap_factor', 'unwrap_lines',
                 'delete_blank_paragraphs', 'format_scene_breaks',
                 'dehyphenate', 'renumber_headings']
                )
        self.db, self.book_id = db, book_id
        self.initialize_options(get_option, get_help, db, book_id)

        self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
        self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)

        # Sync the options group with the checkbox's initial state.
        self.enable_heuristics(self.opt_enable_heuristics.checkState())

    def break_cycles(self):
        """Run the base class cleanup, then disconnect the stateChanged signals."""
        Widget.break_cycles(self)

        # Best-effort cleanup: each signal is disconnected on its own so a
        # failure on one does not leave the other connected.
        for name in ('opt_enable_heuristics', 'opt_unwrap_lines'):
            try:
                getattr(self, name).stateChanged.disconnect()
            except Exception:
                pass

    def set_value_handler(self, g, val):
        """Show 0.0 in the unwrap-factor spin box when its value is None.

        Returns True when the value was handled here; otherwise falls through
        and returns None.
        """
        if val is None and g is self.opt_html_unwrap_factor:
            g.setValue(0.0)
            return True

    def enable_heuristics(self, state):
        """Enable the options group only while the checkbox state is Qt.Checked."""
        self.heuristic_options.setEnabled(state == Qt.Checked)

    def enable_unwrap(self, state):
        """Enable the unwrap-factor spin box only while *state* is Qt.Checked."""
        self.opt_html_unwrap_factor.setEnabled(state == Qt.Checked)
|
227
src/calibre/gui2/convert/heuristics.ui
Normal file
227
src/calibre/gui2/convert/heuristics.ui
Normal file
@ -0,0 +1,227 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<ui version="4.0">
|
||||||
|
<class>Form</class>
|
||||||
|
<widget class="QWidget" name="Form">
|
||||||
|
<property name="geometry">
|
||||||
|
<rect>
|
||||||
|
<x>0</x>
|
||||||
|
<y>0</y>
|
||||||
|
<width>724</width>
|
||||||
|
<height>470</height>
|
||||||
|
</rect>
|
||||||
|
</property>
|
||||||
|
<property name="windowTitle">
|
||||||
|
<string>Form</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QVBoxLayout" name="verticalLayout">
|
||||||
|
<item>
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string>&lt;b&gt;Heuristic processing&lt;/b&gt; means that calibre will scan your book for common patterns and fix them. As the name implies, this involves guesswork, which means that it could end up worsening the result of a conversion, if calibre guesses wrong. Therefore, it is disabled by default. Often, if a conversion does not turn out as you expect, turning on heuristics can improve matters. Read more about the various heuristic processing options in the &lt;a href=&quot;http://calibre-ebook.com/user_manual/conversion.html#heuristic-processing&quot;&gt;User Manual&lt;/a&gt;.</string>
|
||||||
|
</property>
|
||||||
|
<property name="wordWrap">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
<property name="openExternalLinks">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<spacer name="verticalSpacer_2">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Vertical</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeType">
|
||||||
|
<enum>QSizePolicy::Fixed</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>20</width>
|
||||||
|
<height>15</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_enable_heuristics">
|
||||||
|
<property name="text">
|
||||||
|
<string>Enable &amp;heuristic processing</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QGroupBox" name="heuristic_options">
|
||||||
|
<property name="title">
|
||||||
|
<string>Heuristic Processing</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QVBoxLayout" name="verticalLayout_2">
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_unwrap_lines">
|
||||||
|
<property name="text">
|
||||||
|
<string>Unwrap lines</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<layout class="QHBoxLayout" name="horizontalLayout">
|
||||||
|
<item>
|
||||||
|
<spacer name="horizontalSpacer">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Horizontal</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeType">
|
||||||
|
<enum>QSizePolicy::Fixed</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>40</width>
|
||||||
|
<height>20</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QLabel" name="huf_label">
|
||||||
|
<property name="text">
|
||||||
|
<string>Line &amp;un-wrap factor :</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_html_unwrap_factor</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
|
||||||
|
<property name="toolTip">
|
||||||
|
<string/>
|
||||||
|
</property>
|
||||||
|
<property name="maximum">
|
||||||
|
<double>1.000000000000000</double>
|
||||||
|
</property>
|
||||||
|
<property name="singleStep">
|
||||||
|
<double>0.050000000000000</double>
|
||||||
|
</property>
|
||||||
|
<property name="value">
|
||||||
|
<double>0.400000000000000</double>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<spacer name="horizontalSpacer_2">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Horizontal</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>40</width>
|
||||||
|
<height>20</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_markup_chapter_headings">
|
||||||
|
<property name="text">
|
||||||
|
<string>Detect and markup unformatted chapter headings and sub headings</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_renumber_headings">
|
||||||
|
<property name="text">
|
||||||
|
<string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_delete_blank_paragraphs">
|
||||||
|
<property name="text">
|
||||||
|
<string>Delete blank lines between paragraphs</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_format_scene_breaks">
|
||||||
|
<property name="text">
|
||||||
|
<string>Ensure scene breaks are consistently formatted</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_dehyphenate">
|
||||||
|
<property name="text">
|
||||||
|
<string>Remove unnecessary hyphens</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_italicize_common_cases">
|
||||||
|
<property name="text">
|
||||||
|
<string>Italicize common words and patterns</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_fix_indents">
|
||||||
|
<property name="text">
|
||||||
|
<string>Replace entity indents with CSS indents</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<spacer name="verticalSpacer">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Vertical</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>131</width>
|
||||||
|
<height>35</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
<resources/>
|
||||||
|
<connections>
|
||||||
|
<connection>
|
||||||
|
<sender>opt_enable_heuristics</sender>
|
||||||
|
<signal>toggled(bool)</signal>
|
||||||
|
<receiver>opt_html_unwrap_factor</receiver>
|
||||||
|
<slot>setEnabled(bool)</slot>
|
||||||
|
<hints>
|
||||||
|
<hint type="sourcelabel">
|
||||||
|
<x>328</x>
|
||||||
|
<y>87</y>
|
||||||
|
</hint>
|
||||||
|
<hint type="destinationlabel">
|
||||||
|
<x>481</x>
|
||||||
|
<y>113</y>
|
||||||
|
</hint>
|
||||||
|
</hints>
|
||||||
|
</connection>
|
||||||
|
<connection>
|
||||||
|
<sender>opt_enable_heuristics</sender>
|
||||||
|
<signal>toggled(bool)</signal>
|
||||||
|
<receiver>huf_label</receiver>
|
||||||
|
<slot>setEnabled(bool)</slot>
|
||||||
|
<hints>
|
||||||
|
<hint type="sourcelabel">
|
||||||
|
<x>295</x>
|
||||||
|
<y>88</y>
|
||||||
|
</hint>
|
||||||
|
<hint type="destinationlabel">
|
||||||
|
<x>291</x>
|
||||||
|
<y>105</y>
|
||||||
|
</hint>
|
||||||
|
</hints>
|
||||||
|
</connection>
|
||||||
|
</connections>
|
||||||
|
</ui>
|
@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from calibre.gui2.convert.pdb_output_ui import Ui_Form
|
from calibre.gui2.convert.pdb_output_ui import Ui_Form
|
||||||
from calibre.gui2.convert import Widget
|
from calibre.gui2.convert import Widget
|
||||||
from calibre.ebooks.pdb import FORMAT_WRITERS
|
|
||||||
from calibre.gui2.widgets import BasicComboModel
|
|
||||||
|
|
||||||
format_model = None
|
format_model = None
|
||||||
|
|
||||||
@ -21,17 +19,8 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
|
Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
|
|
||||||
|
for x in get_option('format').option.choices:
|
||||||
|
self.opt_format.addItem(x)
|
||||||
|
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
|
||||||
default = self.opt_format.currentText()
|
|
||||||
|
|
||||||
global format_model
|
|
||||||
if format_model is None:
|
|
||||||
format_model = BasicComboModel(FORMAT_WRITERS.keys())
|
|
||||||
self.format_model = format_model
|
|
||||||
self.opt_format.setModel(self.format_model)
|
|
||||||
|
|
||||||
default_index = self.opt_format.findText(default)
|
|
||||||
format_index = self.opt_format.findText('doc')
|
|
||||||
self.opt_format.setCurrentIndex(default_index if default_index != -1 else format_index if format_index != -1 else 0)
|
|
||||||
|
|
||||||
|
@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from calibre.gui2.convert.pdf_output_ui import Ui_Form
|
from calibre.gui2.convert.pdf_output_ui import Ui_Form
|
||||||
from calibre.gui2.convert import Widget
|
from calibre.gui2.convert import Widget
|
||||||
from calibre.ebooks.pdf.pageoptions import PAPER_SIZES, ORIENTATIONS
|
|
||||||
from calibre.gui2.widgets import BasicComboModel
|
|
||||||
|
|
||||||
paper_size_model = None
|
paper_size_model = None
|
||||||
orientation_model = None
|
orientation_model = None
|
||||||
@ -23,28 +21,11 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
Widget.__init__(self, parent, ['paper_size',
|
Widget.__init__(self, parent, ['paper_size',
|
||||||
'orientation', 'preserve_cover_aspect_ratio'])
|
'orientation', 'preserve_cover_aspect_ratio'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
|
|
||||||
|
for x in get_option('paper_size').option.choices:
|
||||||
|
self.opt_paper_size.addItem(x)
|
||||||
|
for x in get_option('orientation').option.choices:
|
||||||
|
self.opt_orientation.addItem(x)
|
||||||
|
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
|
||||||
default_paper_size = self.opt_paper_size.currentText()
|
|
||||||
default_orientation = self.opt_orientation.currentText()
|
|
||||||
|
|
||||||
global paper_size_model
|
|
||||||
if paper_size_model is None:
|
|
||||||
paper_size_model = BasicComboModel(PAPER_SIZES.keys())
|
|
||||||
self.paper_size_model = paper_size_model
|
|
||||||
self.opt_paper_size.setModel(self.paper_size_model)
|
|
||||||
|
|
||||||
default_paper_size_index = self.opt_paper_size.findText(default_paper_size)
|
|
||||||
letter_index = self.opt_paper_size.findText('letter')
|
|
||||||
self.opt_paper_size.setCurrentIndex(default_paper_size_index if default_paper_size_index != -1 else letter_index if letter_index != -1 else 0)
|
|
||||||
|
|
||||||
global orientation_model
|
|
||||||
if orientation_model is None:
|
|
||||||
orientation_model = BasicComboModel(ORIENTATIONS.keys())
|
|
||||||
self.orientation_model = orientation_model
|
|
||||||
self.opt_orientation.setModel(self.orientation_model)
|
|
||||||
|
|
||||||
default_orientation_index = self.opt_orientation.findText(default_orientation)
|
|
||||||
orientation_index = self.opt_orientation.findText('portrait')
|
|
||||||
self.opt_orientation.setCurrentIndex(default_orientation_index if default_orientation_index != -1 else orientation_index if orientation_index != -1 else 0)
|
|
||||||
|
|
||||||
|
55
src/calibre/gui2/convert/search_and_replace.py
Normal file
55
src/calibre/gui2/convert/search_and_replace.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from calibre.gui2.convert.search_and_replace_ui import Ui_Form
|
||||||
|
from calibre.gui2.convert import Widget
|
||||||
|
from calibre.gui2 import error_dialog
|
||||||
|
|
||||||
|
class SearchAndReplaceWidget(Widget, Ui_Form):
|
||||||
|
|
||||||
|
TITLE = _('Search\n&\nReplace')
|
||||||
|
HELP = _('Modify the document text and structure using user defined patterns.')
|
||||||
|
COMMIT_NAME = 'search_and_replace'
|
||||||
|
ICON = I('search.png')
|
||||||
|
|
||||||
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
|
Widget.__init__(self, parent,
|
||||||
|
['sr1_search', 'sr1_replace',
|
||||||
|
'sr2_search', 'sr2_replace',
|
||||||
|
'sr3_search', 'sr3_replace']
|
||||||
|
)
|
||||||
|
self.db, self.book_id = db, book_id
|
||||||
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
self.opt_sr1_search.set_msg(_('&Search Regular Expression'))
|
||||||
|
self.opt_sr1_search.set_book_id(book_id)
|
||||||
|
self.opt_sr1_search.set_db(db)
|
||||||
|
self.opt_sr2_search.set_msg(_('&Search Regular Expression'))
|
||||||
|
self.opt_sr2_search.set_book_id(book_id)
|
||||||
|
self.opt_sr2_search.set_db(db)
|
||||||
|
self.opt_sr3_search.set_msg(_('&Search Regular Expression'))
|
||||||
|
self.opt_sr3_search.set_book_id(book_id)
|
||||||
|
self.opt_sr3_search.set_db(db)
|
||||||
|
|
||||||
|
def break_cycles(self):
|
||||||
|
Widget.break_cycles(self)
|
||||||
|
|
||||||
|
self.opt_sr1_search.break_cycles()
|
||||||
|
self.opt_sr2_search.break_cycles()
|
||||||
|
self.opt_sr3_search.break_cycles()
|
||||||
|
|
||||||
|
def pre_commit_check(self):
|
||||||
|
for x in ('sr1_search', 'sr2_search', 'sr3_search'):
|
||||||
|
x = getattr(self, 'opt_'+x)
|
||||||
|
try:
|
||||||
|
pat = unicode(x.regex)
|
||||||
|
re.compile(pat)
|
||||||
|
except Exception, err:
|
||||||
|
error_dialog(self, _('Invalid regular expression'),
|
||||||
|
_('Invalid regular expression: %s')%err, show=True)
|
||||||
|
return False
|
||||||
|
return True
|
213
src/calibre/gui2/convert/search_and_replace.ui
Normal file
213
src/calibre/gui2/convert/search_and_replace.ui
Normal file
@ -0,0 +1,213 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<ui version="4.0">
|
||||||
|
<class>Form</class>
|
||||||
|
<widget class="QWidget" name="Form">
|
||||||
|
<property name="geometry">
|
||||||
|
<rect>
|
||||||
|
<x>0</x>
|
||||||
|
<y>0</y>
|
||||||
|
<width>468</width>
|
||||||
|
<height>451</height>
|
||||||
|
</rect>
|
||||||
|
</property>
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="windowTitle">
|
||||||
|
<string>Form</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QGridLayout" name="gridLayout_4">
|
||||||
|
<property name="sizeConstraint">
|
||||||
|
<enum>QLayout::SetDefaultConstraint</enum>
|
||||||
|
</property>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QGroupBox" name="groupBox">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="title">
|
||||||
|
<string>First expression</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QGridLayout" name="gridLayout_2">
|
||||||
|
<property name="sizeConstraint">
|
||||||
|
<enum>QLayout::SetMinimumSize</enum>
|
||||||
|
</property>
|
||||||
|
<item row="0" column="0">
|
||||||
|
<widget class="RegexEdit" name="opt_sr1_search" native="true">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label_4">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="text">
|
||||||
|
<string>&Replacement Text</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_sr1_replace</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QLineEdit" name="opt_sr1_replace">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QGroupBox" name="groupBox_2">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="title">
|
||||||
|
<string>Second Expression</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
|
<property name="sizeConstraint">
|
||||||
|
<enum>QLayout::SetMinimumSize</enum>
|
||||||
|
</property>
|
||||||
|
<item row="0" column="0">
|
||||||
|
<widget class="RegexEdit" name="opt_sr2_search" native="true">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label_5">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="text">
|
||||||
|
<string>&Replacement Text</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_sr2_replace</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QLineEdit" name="opt_sr2_replace">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="3" column="0">
|
||||||
|
<widget class="QGroupBox" name="groupBox_3">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="title">
|
||||||
|
<string>Third expression</string>
|
||||||
|
</property>
|
||||||
|
<layout class="QGridLayout" name="gridLayout_3">
|
||||||
|
<property name="sizeConstraint">
|
||||||
|
<enum>QLayout::SetMinimumSize</enum>
|
||||||
|
</property>
|
||||||
|
<item row="0" column="0">
|
||||||
|
<widget class="RegexEdit" name="opt_sr3_search" native="true">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label_6">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
<property name="text">
|
||||||
|
<string>&Replacement Text</string>
|
||||||
|
</property>
|
||||||
|
<property name="buddy">
|
||||||
|
<cstring>opt_sr3_replace</cstring>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="0">
|
||||||
|
<widget class="QLineEdit" name="opt_sr3_replace">
|
||||||
|
<property name="sizePolicy">
|
||||||
|
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
|
||||||
|
<horstretch>0</horstretch>
|
||||||
|
<verstretch>0</verstretch>
|
||||||
|
</sizepolicy>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="0" column="0">
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string><p>Search and replace uses <i>regular expressions</i>. See the <a href="http://calibre-ebook.com/user_manual/regexp.html">regular expressions tutorial</a> to get started with regular expressions. Also clicking the wizard buttons below will allow you to test your regular expression against the current input document.</string>
|
||||||
|
</property>
|
||||||
|
<property name="wordWrap">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
<property name="openExternalLinks">
|
||||||
|
<bool>true</bool>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
</layout>
|
||||||
|
</widget>
|
||||||
|
<customwidgets>
|
||||||
|
<customwidget>
|
||||||
|
<class>RegexEdit</class>
|
||||||
|
<extends>QWidget</extends>
|
||||||
|
<header>regex_builder.h</header>
|
||||||
|
<container>1</container>
|
||||||
|
</customwidget>
|
||||||
|
</customwidgets>
|
||||||
|
<resources/>
|
||||||
|
<connections/>
|
||||||
|
</ui>
|
@ -16,6 +16,8 @@ from calibre.ebooks.conversion.config import GuiRecommendations, save_specifics,
|
|||||||
from calibre.gui2.convert.single_ui import Ui_Dialog
|
from calibre.gui2.convert.single_ui import Ui_Dialog
|
||||||
from calibre.gui2.convert.metadata import MetadataWidget
|
from calibre.gui2.convert.metadata import MetadataWidget
|
||||||
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
|
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
|
||||||
|
from calibre.gui2.convert.heuristics import HeuristicsWidget
|
||||||
|
from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
|
||||||
from calibre.gui2.convert.page_setup import PageSetupWidget
|
from calibre.gui2.convert.page_setup import PageSetupWidget
|
||||||
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
|
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
|
||||||
from calibre.gui2.convert.toc import TOCWidget
|
from calibre.gui2.convert.toc import TOCWidget
|
||||||
@ -170,6 +172,8 @@ class Config(ResizableDialog, Ui_Dialog):
|
|||||||
self.mw = widget_factory(MetadataWidget)
|
self.mw = widget_factory(MetadataWidget)
|
||||||
self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
|
self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
|
||||||
lf = widget_factory(LookAndFeelWidget)
|
lf = widget_factory(LookAndFeelWidget)
|
||||||
|
hw = widget_factory(HeuristicsWidget)
|
||||||
|
sr = widget_factory(SearchAndReplaceWidget)
|
||||||
ps = widget_factory(PageSetupWidget)
|
ps = widget_factory(PageSetupWidget)
|
||||||
sd = widget_factory(StructureDetectionWidget)
|
sd = widget_factory(StructureDetectionWidget)
|
||||||
toc = widget_factory(TOCWidget)
|
toc = widget_factory(TOCWidget)
|
||||||
@ -203,7 +207,7 @@ class Config(ResizableDialog, Ui_Dialog):
|
|||||||
if not c: break
|
if not c: break
|
||||||
self.stack.removeWidget(c)
|
self.stack.removeWidget(c)
|
||||||
|
|
||||||
widgets = [self.mw, lf, ps, sd, toc]
|
widgets = [self.mw, lf, hw, ps, sd, toc, sr]
|
||||||
if input_widget is not None:
|
if input_widget is not None:
|
||||||
widgets.append(input_widget)
|
widgets.append(input_widget)
|
||||||
if output_widget is not None:
|
if output_widget is not None:
|
||||||
|
@ -100,7 +100,7 @@
|
|||||||
</size>
|
</size>
|
||||||
</property>
|
</property>
|
||||||
<property name="spacing">
|
<property name="spacing">
|
||||||
<number>20</number>
|
<number>10</number>
|
||||||
</property>
|
</property>
|
||||||
<property name="wordWrap">
|
<property name="wordWrap">
|
||||||
<bool>true</bool>
|
<bool>true</bool>
|
||||||
@ -129,8 +129,8 @@
|
|||||||
<rect>
|
<rect>
|
||||||
<x>0</x>
|
<x>0</x>
|
||||||
<y>0</y>
|
<y>0</y>
|
||||||
<width>805</width>
|
<width>810</width>
|
||||||
<height>484</height>
|
<height>494</height>
|
||||||
</rect>
|
</rect>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QVBoxLayout" name="verticalLayout_3">
|
<layout class="QVBoxLayout" name="verticalLayout_3">
|
||||||
|
@ -6,8 +6,6 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre.gui2.convert.structure_detection_ui import Ui_Form
|
from calibre.gui2.convert.structure_detection_ui import Ui_Form
|
||||||
from calibre.gui2.convert import Widget
|
from calibre.gui2.convert import Widget
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog
|
||||||
@ -24,12 +22,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['chapter', 'chapter_mark',
|
['chapter', 'chapter_mark',
|
||||||
'remove_first_image',
|
'remove_first_image',
|
||||||
'insert_metadata', 'page_breaks_before',
|
'insert_metadata', 'page_breaks_before']
|
||||||
'preprocess_html', 'remove_header', 'header_regex',
|
|
||||||
'remove_footer', 'footer_regex','html_unwrap_factor']
|
|
||||||
)
|
)
|
||||||
self.opt_html_unwrap_factor.setEnabled(False)
|
|
||||||
self.huf_label.setEnabled(False)
|
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
for x in ('pagebreak', 'rule', 'both', 'none'):
|
for x in ('pagebreak', 'rule', 'both', 'none'):
|
||||||
self.opt_chapter_mark.addItem(x)
|
self.opt_chapter_mark.addItem(x)
|
||||||
@ -37,28 +31,11 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):'))
|
self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):'))
|
||||||
self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
|
self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
|
||||||
'(XPath expression):'))
|
'(XPath expression):'))
|
||||||
self.opt_header_regex.set_msg(_('Header regular expression:'))
|
|
||||||
self.opt_header_regex.set_book_id(book_id)
|
|
||||||
self.opt_header_regex.set_db(db)
|
|
||||||
self.opt_footer_regex.set_msg(_('Footer regular expression:'))
|
|
||||||
self.opt_footer_regex.set_book_id(book_id)
|
|
||||||
self.opt_footer_regex.set_db(db)
|
|
||||||
|
|
||||||
def break_cycles(self):
|
def break_cycles(self):
|
||||||
Widget.break_cycles(self)
|
Widget.break_cycles(self)
|
||||||
self.opt_header_regex.break_cycles()
|
|
||||||
self.opt_footer_regex.break_cycles()
|
|
||||||
|
|
||||||
def pre_commit_check(self):
|
def pre_commit_check(self):
|
||||||
for x in ('header_regex', 'footer_regex'):
|
|
||||||
x = getattr(self, 'opt_'+x)
|
|
||||||
try:
|
|
||||||
pat = unicode(x.regex)
|
|
||||||
re.compile(pat)
|
|
||||||
except Exception, err:
|
|
||||||
error_dialog(self, _('Invalid regular expression'),
|
|
||||||
_('Invalid regular expression: %s')%err).exec_()
|
|
||||||
return False
|
|
||||||
for x in ('chapter', 'page_breaks_before'):
|
for x in ('chapter', 'page_breaks_before'):
|
||||||
x = getattr(self, 'opt_'+x)
|
x = getattr(self, 'opt_'+x)
|
||||||
if not x.check():
|
if not x.check():
|
||||||
@ -66,8 +43,3 @@ class StructureDetectionWidget(Widget, Ui_Form):
|
|||||||
_('The XPath expression %s is invalid.')%x.text).exec_()
|
_('The XPath expression %s is invalid.')%x.text).exec_()
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def set_value_handler(self, g, val):
|
|
||||||
if val is None and g is self.opt_html_unwrap_factor:
|
|
||||||
g.setValue(0.0)
|
|
||||||
return True
|
|
||||||
|
@ -14,10 +14,10 @@
|
|||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QGridLayout" name="gridLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item row="0" column="1" colspan="2">
|
<item row="0" column="0" colspan="3">
|
||||||
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
<widget class="XPathEdit" name="opt_chapter" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="1" column="0" colspan="2">
|
<item row="1" column="0">
|
||||||
<widget class="QLabel" name="label">
|
<widget class="QLabel" name="label">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Chapter &mark:</string>
|
<string>Chapter &mark:</string>
|
||||||
@ -27,7 +27,7 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="1" column="2">
|
<item row="1" column="1">
|
||||||
<widget class="QComboBox" name="opt_chapter_mark">
|
<widget class="QComboBox" name="opt_chapter_mark">
|
||||||
<property name="minimumContentsLength">
|
<property name="minimumContentsLength">
|
||||||
<number>20</number>
|
<number>20</number>
|
||||||
@ -41,17 +41,17 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="5" column="0" colspan="2">
|
<item row="3" column="0" colspan="2">
|
||||||
<widget class="QCheckBox" name="opt_insert_metadata">
|
<widget class="QCheckBox" name="opt_insert_metadata">
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>Insert &metadata as page at start of book</string>
|
<string>Insert &metadata as page at start of book</string>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="11" column="0" colspan="3">
|
<item row="5" column="0" colspan="3">
|
||||||
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item row="12" column="0" colspan="3">
|
<item row="6" column="0" colspan="3">
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Vertical</enum>
|
<enum>Qt::Vertical</enum>
|
||||||
@ -64,53 +64,7 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
<item row="8" column="0" colspan="2">
|
<item row="1" column="2">
|
||||||
<widget class="QCheckBox" name="opt_remove_footer">
|
|
||||||
<property name="text">
|
|
||||||
<string>Remove F&ooter</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="6" column="0" colspan="2">
|
|
||||||
<widget class="QCheckBox" name="opt_remove_header">
|
|
||||||
<property name="text">
|
|
||||||
<string>Remove H&eader</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="7" column="0" colspan="3">
|
|
||||||
<widget class="RegexEdit" name="opt_header_regex" native="true"/>
|
|
||||||
</item>
|
|
||||||
<item row="9" column="0" colspan="3">
|
|
||||||
<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
|
|
||||||
</item>
|
|
||||||
<item row="4" column="1">
|
|
||||||
<widget class="QLabel" name="huf_label">
|
|
||||||
<property name="text">
|
|
||||||
<string>Line &un-wrap factor during preprocess:</string>
|
|
||||||
</property>
|
|
||||||
<property name="buddy">
|
|
||||||
<cstring>opt_html_unwrap_factor</cstring>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="4" column="2">
|
|
||||||
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
|
|
||||||
<property name="toolTip">
|
|
||||||
<string/>
|
|
||||||
</property>
|
|
||||||
<property name="maximum">
|
|
||||||
<double>1.000000000000000</double>
|
|
||||||
</property>
|
|
||||||
<property name="singleStep">
|
|
||||||
<double>0.050000000000000</double>
|
|
||||||
</property>
|
|
||||||
<property name="value">
|
|
||||||
<double>0.400000000000000</double>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
<item row="4" column="0">
|
|
||||||
<spacer name="horizontalSpacer">
|
<spacer name="horizontalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
<enum>Qt::Horizontal</enum>
|
<enum>Qt::Horizontal</enum>
|
||||||
@ -123,13 +77,6 @@
|
|||||||
</property>
|
</property>
|
||||||
</spacer>
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
<item row="3" column="0" colspan="2">
|
|
||||||
<widget class="QCheckBox" name="opt_preprocess_html">
|
|
||||||
<property name="text">
|
|
||||||
<string>&Preprocess input file to possibly improve structure detection</string>
|
|
||||||
</property>
|
|
||||||
</widget>
|
|
||||||
</item>
|
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<customwidgets>
|
<customwidgets>
|
||||||
@ -139,46 +86,7 @@
|
|||||||
<header>convert/xpath_wizard.h</header>
|
<header>convert/xpath_wizard.h</header>
|
||||||
<container>1</container>
|
<container>1</container>
|
||||||
</customwidget>
|
</customwidget>
|
||||||
<customwidget>
|
|
||||||
<class>RegexEdit</class>
|
|
||||||
<extends>QWidget</extends>
|
|
||||||
<header>regex_builder.h</header>
|
|
||||||
<container>1</container>
|
|
||||||
</customwidget>
|
|
||||||
</customwidgets>
|
</customwidgets>
|
||||||
<resources/>
|
<resources/>
|
||||||
<connections>
|
<connections/>
|
||||||
<connection>
|
|
||||||
<sender>opt_preprocess_html</sender>
|
|
||||||
<signal>toggled(bool)</signal>
|
|
||||||
<receiver>opt_html_unwrap_factor</receiver>
|
|
||||||
<slot>setEnabled(bool)</slot>
|
|
||||||
<hints>
|
|
||||||
<hint type="sourcelabel">
|
|
||||||
<x>328</x>
|
|
||||||
<y>87</y>
|
|
||||||
</hint>
|
|
||||||
<hint type="destinationlabel">
|
|
||||||
<x>481</x>
|
|
||||||
<y>113</y>
|
|
||||||
</hint>
|
|
||||||
</hints>
|
|
||||||
</connection>
|
|
||||||
<connection>
|
|
||||||
<sender>opt_preprocess_html</sender>
|
|
||||||
<signal>toggled(bool)</signal>
|
|
||||||
<receiver>huf_label</receiver>
|
|
||||||
<slot>setEnabled(bool)</slot>
|
|
||||||
<hints>
|
|
||||||
<hint type="sourcelabel">
|
|
||||||
<x>295</x>
|
|
||||||
<y>88</y>
|
|
||||||
</hint>
|
|
||||||
<hint type="destinationlabel">
|
|
||||||
<x>291</x>
|
|
||||||
<y>105</y>
|
|
||||||
</hint>
|
|
||||||
</hints>
|
|
||||||
</connection>
|
|
||||||
</connections>
|
|
||||||
</ui>
|
</ui>
|
||||||
|
@ -4,10 +4,10 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
from PyQt4.Qt import Qt
|
||||||
|
|
||||||
from calibre.gui2.convert.txt_output_ui import Ui_Form
|
from calibre.gui2.convert.txt_output_ui import Ui_Form
|
||||||
from calibre.gui2.convert import Widget
|
from calibre.gui2.convert import Widget
|
||||||
from calibre.ebooks.txt.newlines import TxtNewlines
|
|
||||||
from calibre.gui2.widgets import BasicComboModel
|
|
||||||
|
|
||||||
newline_model = None
|
newline_model = None
|
||||||
|
|
||||||
@ -24,16 +24,23 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
|
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
|
||||||
'txt_output_encoding'])
|
'txt_output_encoding'])
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
|
for x in get_option('newline').option.choices:
|
||||||
|
self.opt_newline.addItem(x)
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
|
||||||
default = self.opt_newline.currentText()
|
self.opt_markdown_format.stateChanged.connect(self.enable_markdown_format)
|
||||||
|
self.enable_markdown_format(self.opt_markdown_format.checkState())
|
||||||
|
|
||||||
global newline_model
|
def break_cycles(self):
|
||||||
if newline_model is None:
|
Widget.break_cycles(self)
|
||||||
newline_model = BasicComboModel(TxtNewlines.NEWLINE_TYPES.keys())
|
|
||||||
self.newline_model = newline_model
|
try:
|
||||||
self.opt_newline.setModel(self.newline_model)
|
self.opt_markdown_format.stateChanged.disconnect()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def enable_markdown_format(self, state):
|
||||||
|
state = state == Qt.Checked
|
||||||
|
self.opt_keep_links.setEnabled(state)
|
||||||
|
self.opt_keep_image_references.setEnabled(state)
|
||||||
|
|
||||||
default_index = self.opt_newline.findText(default)
|
|
||||||
system_index = self.opt_newline.findText('system')
|
|
||||||
self.opt_newline.setCurrentIndex(default_index if default_index != -1 else system_index if system_index != -1 else 0)
|
|
||||||
|
@ -6,8 +6,8 @@
|
|||||||
<rect>
|
<rect>
|
||||||
<x>0</x>
|
<x>0</x>
|
||||||
<y>0</y>
|
<y>0</y>
|
||||||
<width>422</width>
|
<width>430</width>
|
||||||
<height>64</height>
|
<height>74</height>
|
||||||
</rect>
|
</rect>
|
||||||
</property>
|
</property>
|
||||||
<property name="windowTitle">
|
<property name="windowTitle">
|
||||||
@ -53,7 +53,7 @@
|
|||||||
<item row="0" column="1">
|
<item row="0" column="1">
|
||||||
<widget class="QToolButton" name="button">
|
<widget class="QToolButton" name="button">
|
||||||
<property name="toolTip">
|
<property name="toolTip">
|
||||||
<string>Use a wizard to help construct the XPath expression</string>
|
<string>Use a wizard to help construct the Regular expression</string>
|
||||||
</property>
|
</property>
|
||||||
<property name="text">
|
<property name="text">
|
||||||
<string>...</string>
|
<string>...</string>
|
||||||
@ -70,19 +70,6 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
<item row="0" column="2">
|
|
||||||
<spacer name="horizontalSpacer">
|
|
||||||
<property name="orientation">
|
|
||||||
<enum>Qt::Horizontal</enum>
|
|
||||||
</property>
|
|
||||||
<property name="sizeHint" stdset="0">
|
|
||||||
<size>
|
|
||||||
<width>20</width>
|
|
||||||
<height>20</height>
|
|
||||||
</size>
|
|
||||||
</property>
|
|
||||||
</spacer>
|
|
||||||
</item>
|
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
<customwidgets>
|
<customwidgets>
|
||||||
|
@ -19,7 +19,7 @@ from calibre.devices.scanner import DeviceScanner
|
|||||||
from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \
|
from calibre.gui2 import config, error_dialog, Dispatcher, dynamic, \
|
||||||
warning_dialog, info_dialog, choose_dir
|
warning_dialog, info_dialog, choose_dir
|
||||||
from calibre.ebooks.metadata import authors_to_string
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
from calibre import preferred_encoding, prints, force_unicode
|
from calibre import preferred_encoding, prints, force_unicode, as_unicode
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
from calibre.devices.errors import FreeSpaceError
|
from calibre.devices.errors import FreeSpaceError
|
||||||
from calibre.devices.apple.driver import ITUNES_ASYNC
|
from calibre.devices.apple.driver import ITUNES_ASYNC
|
||||||
@ -68,13 +68,7 @@ class DeviceJob(BaseJob): # {{{
|
|||||||
if self._aborted:
|
if self._aborted:
|
||||||
return
|
return
|
||||||
self.failed = True
|
self.failed = True
|
||||||
try:
|
ex = as_unicode(err)
|
||||||
ex = unicode(err)
|
|
||||||
except:
|
|
||||||
try:
|
|
||||||
ex = str(err).decode(preferred_encoding, 'replace')
|
|
||||||
except:
|
|
||||||
ex = repr(err)
|
|
||||||
self._details = ex + '\n\n' + \
|
self._details = ex + '\n\n' + \
|
||||||
traceback.format_exc()
|
traceback.format_exc()
|
||||||
self.exception = err
|
self.exception = err
|
||||||
|
@ -12,6 +12,8 @@ from calibre.ebooks.conversion.plumber import Plumber
|
|||||||
from calibre.utils.logging import Log
|
from calibre.utils.logging import Log
|
||||||
from calibre.gui2.preferences.conversion_ui import Ui_Form
|
from calibre.gui2.preferences.conversion_ui import Ui_Form
|
||||||
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
|
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
|
||||||
|
from calibre.gui2.convert.heuristics import HeuristicsWidget
|
||||||
|
from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
|
||||||
from calibre.gui2.convert.page_setup import PageSetupWidget
|
from calibre.gui2.convert.page_setup import PageSetupWidget
|
||||||
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
|
from calibre.gui2.convert.structure_detection import StructureDetectionWidget
|
||||||
from calibre.gui2.convert.toc import TOCWidget
|
from calibre.gui2.convert.toc import TOCWidget
|
||||||
@ -82,8 +84,9 @@ class Base(ConfigWidgetBase, Ui_Form):
|
|||||||
class CommonOptions(Base):
|
class CommonOptions(Base):
|
||||||
|
|
||||||
def load_conversion_widgets(self):
|
def load_conversion_widgets(self):
|
||||||
self.conversion_widgets = [LookAndFeelWidget, PageSetupWidget,
|
self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
|
||||||
StructureDetectionWidget, TOCWidget]
|
PageSetupWidget,
|
||||||
|
StructureDetectionWidget, TOCWidget, SearchAndReplaceWidget,]
|
||||||
|
|
||||||
class InputOptions(Base):
|
class InputOptions(Base):
|
||||||
|
|
||||||
|
@ -311,32 +311,6 @@ class FontFamilyModel(QAbstractListModel):
|
|||||||
def index_of(self, family):
|
def index_of(self, family):
|
||||||
return self.families.index(family.strip())
|
return self.families.index(family.strip())
|
||||||
|
|
||||||
class BasicComboModel(QAbstractListModel):
|
|
||||||
|
|
||||||
def __init__(self, items, *args):
|
|
||||||
QAbstractListModel.__init__(self, *args)
|
|
||||||
self.items = [i for i in items]
|
|
||||||
self.items.sort()
|
|
||||||
|
|
||||||
def rowCount(self, *args):
|
|
||||||
return len(self.items)
|
|
||||||
|
|
||||||
def data(self, index, role):
|
|
||||||
try:
|
|
||||||
item = self.items[index.row()]
|
|
||||||
except:
|
|
||||||
traceback.print_exc()
|
|
||||||
return NONE
|
|
||||||
if role == Qt.DisplayRole:
|
|
||||||
return QVariant(item)
|
|
||||||
if role == Qt.FontRole:
|
|
||||||
return QVariant(QFont(item))
|
|
||||||
return NONE
|
|
||||||
|
|
||||||
def index_of(self, item):
|
|
||||||
return self.items.index(item.strip())
|
|
||||||
|
|
||||||
|
|
||||||
class BasicListItem(QListWidgetItem):
|
class BasicListItem(QListWidgetItem):
|
||||||
|
|
||||||
def __init__(self, text, user_data=None):
|
def __init__(self, text, user_data=None):
|
||||||
|
@ -255,6 +255,98 @@ you are producing are meant for a particular device type, choose the correspondi
|
|||||||
|
|
||||||
The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
|
The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
|
||||||
|
|
||||||
|
.. _heuristic-processing:
|
||||||
|
|
||||||
|
Heuristic Processing
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
Heuristic Processing provides a variety of functions which can be used that try to detect and correct
|
||||||
|
common problems in poorly formatted input documents. Use these functions if your input document suffers
|
||||||
|
from bad formatting. Because these functions rely on common patterns, be aware that in some cases an
|
||||||
|
option may lead to worse results, so use with care. As an example, several of these options will
|
||||||
|
remove all non-breaking-space entities.
|
||||||
|
|
||||||
|
:guilabel:`Enable heuristic processing`
|
||||||
|
This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
|
||||||
|
This must be enabled in order for various sub-functions to be applied
|
||||||
|
|
||||||
|
:guilabel:`Unwrap lines`
|
||||||
|
Enabling this option will cause |app| to attempt to detect and correct hard line breaks that exist
|
||||||
|
within a document using punctuation clues and line length. |app| will first attempt to detect whether
|
||||||
|
hard line breaks exist; if they do not appear to exist, |app| will not attempt to unwrap lines. The
|
||||||
|
line-unwrap factor can be reduced if you want to 'force' |app| to unwrap lines.
|
||||||
|
|
||||||
|
:guilabel:`Line-unwrap factor`
|
||||||
|
This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
|
||||||
|
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
|
||||||
|
than the length of 40% of all lines in the document. If your document only has a few line breaks which need
|
||||||
|
correction, then this value should be reduced to somewhere between 0.1 and 0.2.
|
||||||
|
|
||||||
|
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
|
||||||
|
If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
|
||||||
|
|app| can use this option to attempt to detect them and surround them with heading tags. <h2> tags are used
|
||||||
|
for chapter headings; <h3> tags are used for any titles that are detected.
|
||||||
|
|
||||||
|
This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings
|
||||||
|
to correctly detect chapters and build a TOC. Adjust the XPath under Structure Detection if a TOC is not automatically
|
||||||
|
created. If there are no other headings used in the document then setting "//h:h2" under Structure Detection would
|
||||||
|
be the easiest way to create a TOC for the document.
|
||||||
|
|
||||||
|
The inserted headings are not formatted; to apply formatting, use the :guilabel:`Extra CSS` option under
|
||||||
|
the Look and Feel conversion settings. For example, to center heading tags, use the following::
|
||||||
|
|
||||||
|
h2, h3 { text-align: center }
|
||||||
|
|
||||||
|
:guilabel:`Renumber sequences of <h1> or <h2> tags`
|
||||||
|
Some publishers format chapter headings using multiple <h1> or <h2> tags sequentially.
|
||||||
|
|app|'s default conversion settings will cause such titles to be split into two pieces. This option
|
||||||
|
will re-number the heading tags to prevent splitting.
|
||||||
|
|
||||||
|
:guilabel:`Delete blank lines between paragraphs`
|
||||||
|
This option will cause |app| to analyze blank lines included within the document. If every paragraph is interleaved
|
||||||
|
with a blank line, then |app| will remove all those blank paragraphs. Sequences of multiple blank lines will be
|
||||||
|
considered scene breaks and retained as a single paragraph. This option differs from the 'Remove Paragraph Spacing'
|
||||||
|
option under 'Look and Feel' in that it actually modifies the HTML content, while the other option modifies the document
|
||||||
|
styles. This option can also remove paragraphs which were inserted using |app|'s 'Insert blank line' option.
|
||||||
|
|
||||||
|
:guilabel:`Ensure scene breaks are consistently formatted`
|
||||||
|
With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.
|
||||||
|
It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
|
||||||
|
page width. Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and
|
||||||
|
thus become difficult to distinguish.
|
||||||
|
|
||||||
|
:guilabel:`Remove unnecessary hyphens`
|
||||||
|
|app| will analyze all hyphenated content in the document when this option is enabled. The document itself is used
|
||||||
|
as a dictionary for analysis. This allows |app| to accurately remove hyphens for any words in the document in any language,
|
||||||
|
along with made-up and obscure scientific words. The primary drawback is words appearing only a single time in the document
|
||||||
|
will not be changed. Analysis happens in two passes, the first pass analyzes line endings. Lines are only unwrapped if the
|
||||||
|
word exists with or without a hyphen in the document. The second pass analyzes all hyphenated words throughout the document,
|
||||||
|
hyphens are removed if the word exists elsewhere in the document without a match.
|
||||||
|
|
||||||
|
:guilabel:`Italicize common words and patterns`
|
||||||
|
When enabled, |app| will look for common words and patterns that denote italics and italicize them. Examples are common text
|
||||||
|
conventions such as ~word~ or phrases that should generally be italicized, e.g. Latin phrases like 'etc.' or 'et cetera'.
|
||||||
|
|
||||||
|
:guilabel:`Replace entity indents with CSS indents`
|
||||||
|
Some documents use a convention of defining text indents using non-breaking space entities. When this option is enabled |app| will
|
||||||
|
attempt to detect this sort of formatting and convert it to a 3% text indent using CSS.
|
||||||
|
|
||||||
|
.. _search-replace:
|
||||||
|
|
||||||
|
Search & Replace
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
|
||||||
|
behind page headers and footers in the text. These options use regular expressions to try and detect
|
||||||
|
the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
|
||||||
|
by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
|
||||||
|
your document. These options can also be used for generic search and replace of any content by additionally
|
||||||
|
specifying a replacement expression.
|
||||||
|
|
||||||
|
The search works by using a python regular expression. All matched text is simply removed from
|
||||||
|
the document or replaced using the replacement pattern. You can learn more about regular expressions and
|
||||||
|
their syntax at :ref:`regexptutorial`.
|
||||||
|
|
||||||
.. _structure-detection:
|
.. _structure-detection:
|
||||||
|
|
||||||
Structure Detection
|
Structure Detection
|
||||||
@ -298,21 +390,6 @@ which means that |app| will insert page breaks before every `<h1>` and `<h2>` ta
|
|||||||
|
|
||||||
The default expressions may change depending on the input format you are converting.
|
The default expressions may change depending on the input format you are converting.
|
||||||
|
|
||||||
Removing headers and footers
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
|
|
||||||
behind page headers and footers in the text. These options use regular expressions to try and detect
|
|
||||||
the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
|
|
||||||
by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
|
|
||||||
your document.
|
|
||||||
|
|
||||||
The header and footer regular expressions are used in conjunction with the remove header and footer options.
|
|
||||||
If the remove option is not enabled the regular expression will not be applied to remove the matched text.
|
|
||||||
The removal works by using a python regular expression. All matched text is simply removed from
|
|
||||||
the document. You can learn more about regular expressions and their syntax at
|
|
||||||
http://docs.python.org/library/re.html.
|
|
||||||
|
|
||||||
Miscellaneous
|
Miscellaneous
|
||||||
~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
@ -330,16 +407,6 @@ There are a few more options in this section.
|
|||||||
two covers. This option will simply remove the first image from the source document, thereby
|
two covers. This option will simply remove the first image from the source document, thereby
|
||||||
ensuring that the converted book has only one cover, the one specified in |app|.
|
ensuring that the converted book has only one cover, the one specified in |app|.
|
||||||
|
|
||||||
:guilabel:`Preprocess input`
|
|
||||||
This option activates various algorithms that try to detect and correct common cases of
|
|
||||||
badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
|
|
||||||
Turn this option on if your input document suffers from bad formatting. But be aware that in
|
|
||||||
some cases, this option can lead to worse results, so use with care.
|
|
||||||
|
|
||||||
:guilabel:`Line-unwrap factor`
|
|
||||||
This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
|
|
||||||
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
|
|
||||||
than the length of 40% of all lines in the document.
|
|
||||||
|
|
||||||
Table of Contents
|
Table of Contents
|
||||||
------------------
|
------------------
|
||||||
@ -488,26 +555,33 @@ at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
|
|||||||
Convert TXT documents
|
Convert TXT documents
|
||||||
~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
TXT documents have no well defined way to specify formatting like bold, italics, etc, or document structure like paragraphs, headings, sections and so on.
|
TXT documents have no well defined way to specify formatting like bold, italics, etc, or document
|
||||||
Since TXT documents provide no way to explicitly mark parts of
|
structure like paragraphs, headings, sections and so on, but there are a variety of conventions commonly
|
||||||
the text, by default |app| only groups lines in the input document into paragraphs. The default is to assume one or
|
used. By default |app| attempts automatic detection of the correct formatting and markup based on those
|
||||||
more blank lines are a paragraph boundary::
|
conventions.
|
||||||
|
|
||||||
|
TXT input supports a number of options to differentiate how paragraphs are detected.
|
||||||
|
|
||||||
|
:guilabel:`Paragraph Style: Auto`
|
||||||
|
Analyzes the text file and attempts to automatically determine how paragraphs are defined. This
|
||||||
|
option will generally work fine; if you get undesirable results, try one of the manual options.
|
||||||
|
|
||||||
|
:guilabel:`Paragraph Style: Block`
|
||||||
|
Assumes one or more blank lines are a paragraph boundary::
|
||||||
|
|
||||||
This is the first.
|
This is the first.
|
||||||
|
|
||||||
This is the
|
This is the
|
||||||
second paragraph.
|
second paragraph.
|
||||||
|
|
||||||
TXT input supports a number of options to differentiate how paragraphs are detected.
|
:guilabel:`Paragraph Style: Single`
|
||||||
|
|
||||||
:guilabel:`Treat each line as a paragraph`
|
|
||||||
Assumes that every line is a paragraph::
|
Assumes that every line is a paragraph::
|
||||||
|
|
||||||
This is the first.
|
This is the first.
|
||||||
This is the second.
|
This is the second.
|
||||||
This is the third.
|
This is the third.
|
||||||
|
|
||||||
:guilabel:`Assume print formatting`
|
:guilabel:`Paragraph Style: Print`
|
||||||
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
|
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
|
||||||
the next line that starts with an indent is reached::
|
the next line that starts with an indent is reached::
|
||||||
|
|
||||||
@ -518,13 +592,28 @@ TXT input supports a number of options to differentiate how paragraphs are detec
|
|||||||
This is the
|
This is the
|
||||||
third.
|
third.
|
||||||
|
|
||||||
:guilabel:`Process using markdown`
|
:guilabel:`Paragraph Style: Unformatted`
|
||||||
|
Assumes that the document has no formatting, but does use hard line breaks. Punctuation
|
||||||
|
and median line length are used to attempt to re-create paragraphs.
|
||||||
|
|
||||||
|
:guilabel:`Formatting Style: Auto`
|
||||||
|
Attempts to detect the type of formatting markup being used. If no markup is used then heuristic
|
||||||
|
formatting will be applied.
|
||||||
|
|
||||||
|
:guilabel:`Formatting Style: Heuristic`
|
||||||
|
Analyses the document for common chapter headings, scene breaks, and italicized words and applies the
|
||||||
|
appropriate html markup during conversion.
|
||||||
|
|
||||||
|
:guilabel:`Formatting Style: Markdown`
|
||||||
|app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
|
|app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
|
||||||
allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
|
allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
|
||||||
lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
|
lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
|
||||||
expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
|
expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
|
||||||
You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.
|
You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.
|
||||||
|
|
||||||
|
:guilabel:`Formatting Style: None`
|
||||||
|
Applies no special formatting to the text; the document is converted to HTML with no other changes.
|
||||||
|
|
||||||
|
|
||||||
Convert PDF documents
|
Convert PDF documents
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
@ -107,10 +107,10 @@ My device is not being detected by |app|?
|
|||||||
Follow these steps to find the problem:
|
Follow these steps to find the problem:
|
||||||
|
|
||||||
* Make sure that you are connecting only a single device to your computer at a time. Do not have another |app| supported device like an iPhone/iPad etc. at the same time.
|
* Make sure that you are connecting only a single device to your computer at a time. Do not have another |app| supported device like an iPhone/iPad etc. at the same time.
|
||||||
* Make sure you are running the latest version of |app|. The latest version can always be downloaded from `http://calibre-ebook.com/download`_.
|
* Make sure you are running the latest version of |app|. The latest version can always be downloaded from `the calibre website <http://calibre-ebook.com/download>`_.
|
||||||
* Ensure your operating system is seeing the device. That is, the device should be mounted as a disk that you can access using Windows explorer or whatever the file management program on your computer is
|
* Ensure your operating system is seeing the device. That is, the device should be mounted as a disk that you can access using Windows explorer or whatever the file management program on your computer is
|
||||||
* In calibre, go to Preferences->Plugins->Device Interface plugin and make sure the plugin for your device is enabled.
|
* In calibre, go to Preferences->Plugins->Device Interface plugin and make sure the plugin for your device is enabled.
|
||||||
* If all the above steps fail, go to Preferences->Miscellaneous and click debug device detection with your device attached and post the output as a ticket on `http://bugs.calibre-ebook.com`_.
|
* If all the above steps fail, go to Preferences->Miscellaneous and click debug device detection with your device attached and post the output as a ticket on `the calibre bug tracker <http://bugs.calibre-ebook.com>`_.
|
||||||
|
|
||||||
How does |app| manage collections on my SONY reader?
|
How does |app| manage collections on my SONY reader?
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
@ -21,7 +21,7 @@ This is, inevitably, going to be somewhat technical- after all, regular expressi
|
|||||||
Where in |app| can you use regular expressions?
|
Where in |app| can you use regular expressions?
|
||||||
---------------------------------------------------
|
---------------------------------------------------
|
||||||
|
|
||||||
There are a few places |app| uses regular expressions. There's the header/footer removal in conversion options, metadata detection from filenames in the import settings and, since last version, there's the option to use regular expressions to search and replace in metadata of multiple books.
|
There are a few places |app| uses regular expressions. There's the Search & Replace in conversion options, metadata detection from filenames in the import settings and Search & Replace when editing the metadata of books in bulk.
|
||||||
|
|
||||||
What on earth *is* a regular expression?
|
What on earth *is* a regular expression?
|
||||||
------------------------------------------------
|
------------------------------------------------
|
||||||
@ -94,7 +94,7 @@ I think I'm beginning to understand these regular expressions now... how do I us
|
|||||||
Conversions
|
Conversions
|
||||||
^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
Let's begin with the conversion settings, which is really neat. In the structure detection part, you can input a regexp (short for regular expression) that describes the header or footer string that will be removed during the conversion. The neat part is the wizard. Click on the wizard staff and you get a preview of what |app| "sees" during the conversion process. Scroll down to the header or footer you want to remove, select and copy it, paste it into the regexp field on top of the window. If there are variable parts, like page numbers or so, use sets and quantifiers to cover those, and while you're at it, remember to escape special characters, if there are some. Hit the button labeled :guilabel:`Test` and |app| highlights the parts it would remove were you to use the regexp. Once you're satisfied, hit OK and convert. Be careful if your conversion source has tags like this example::
|
Let's begin with the conversion settings, which is really neat. In the Search and Replace part, you can input a regexp (short for regular expression) that describes the string that will be replaced during the conversion. The neat part is the wizard. Click on the wizard staff and you get a preview of what |app| "sees" during the conversion process. Scroll down to the string you want to remove, select and copy it, paste it into the regexp field on top of the window. If there are variable parts, like page numbers or so, use sets and quantifiers to cover those, and while you're at it, remember to escape special characters, if there are some. Hit the button labeled :guilabel:`Test` and |app| highlights the parts it would replace were you to use the regexp. Once you're satisfied, hit OK and convert. Be careful if your conversion source has tags like this example::
|
||||||
|
|
||||||
Maybe, but the cops feel like you do, Anita. What's one more dead vampire?
|
Maybe, but the cops feel like you do, Anita. What's one more dead vampire?
|
||||||
New laws don't change that. </p>
|
New laws don't change that. </p>
|
||||||
@ -104,7 +104,7 @@ Let's begin with the conversion settings, which is really neat. In the structure
|
|||||||
<p class="calibre4"> It had only been two years since Addison v. Clark.
|
<p class="calibre4"> It had only been two years since Addison v. Clark.
|
||||||
The court case gave us a revised version of what life was
|
The court case gave us a revised version of what life was
|
||||||
|
|
||||||
(shamelessly ripped out of `this thread <http://www.mobileread.com/forums/showthread.php?t=75594">`_). You'd have to remove some of the tags as well. In this example, I'd recommend beginning with the tag ``<b class="calibre2">``, now you have to end with the corresponding closing tag (opening tags are ``<tag>``, closing tags are ``</tag>``), which is simply the next ``</b>`` in this case. (Refer to a good HTML manual or ask in the forum if you are unclear on this point.) The opening tag can be described using ``<b.*?>``, the closing tag using ``</b>``, thus we could remove everything between those tags using ``<b.*?>.*?</b>``. But using this expression would be a bad idea, because it removes everything enclosed by <b>- tags (which, by the way, render the enclosed text in bold print), and it's a fair bet that we'll remove portions of the book in this way. Instead, include the beginning of the enclosed string as well, making the regular expression ``<b.*?>\s*Generated\s+by\s+ABC\s+Amber\s+LIT.*?</b>`` The ``\s`` with quantifiers are included here instead of explicitly using the spaces as seen in the string to catch any variations of the string that might occur. Remember to check what |app| will remove to make sure you don't remove any portions you want to keep if you test a new expression. If you only check one occurrence, you might miss a mismatch somewhere else in the text. Also note that should you accidentally remove more or fewer tags than you actually wanted to, |app| tries to repair the damaged code after doing the header/footer removal.
|
(shamelessly ripped out of `this thread <http://www.mobileread.com/forums/showthread.php?t=75594>`_). You'd have to remove some of the tags as well. In this example, I'd recommend beginning with the tag ``<b class="calibre2">``, now you have to end with the corresponding closing tag (opening tags are ``<tag>``, closing tags are ``</tag>``), which is simply the next ``</b>`` in this case. (Refer to a good HTML manual or ask in the forum if you are unclear on this point.) The opening tag can be described using ``<b.*?>``, the closing tag using ``</b>``, thus we could remove everything between those tags using ``<b.*?>.*?</b>``. But using this expression would be a bad idea, because it removes everything enclosed by <b>- tags (which, by the way, render the enclosed text in bold print), and it's a fair bet that we'll remove portions of the book in this way. Instead, include the beginning of the enclosed string as well, making the regular expression ``<b.*?>\s*Generated\s+by\s+ABC\s+Amber\s+LIT.*?</b>`` The ``\s`` with quantifiers are included here instead of explicitly using the spaces as seen in the string to catch any variations of the string that might occur. Remember to check what |app| will remove to make sure you don't remove any portions you want to keep if you test a new expression. If you only check one occurrence, you might miss a mismatch somewhere else in the text. Also note that should you accidentally remove more or fewer tags than you actually wanted to, |app| tries to repair the damaged code after doing the removal.
|
||||||
|
|
||||||
Adding books
|
Adding books
|
||||||
^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -52,9 +52,10 @@ def is_date_undefined(qt_or_dt):
|
|||||||
return True
|
return True
|
||||||
if hasattr(d, 'toString'):
|
if hasattr(d, 'toString'):
|
||||||
d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz)
|
d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz)
|
||||||
return d.year == UNDEFINED_DATE.year and \
|
return d.year < UNDEFINED_DATE.year or (
|
||||||
d.month == UNDEFINED_DATE.month and \
|
d.year == UNDEFINED_DATE.year and
|
||||||
d.day == UNDEFINED_DATE.day
|
d.month == UNDEFINED_DATE.month and
|
||||||
|
d.day == UNDEFINED_DATE.day)
|
||||||
|
|
||||||
def parse_date(date_string, assume_utc=False, as_utc=True, default=None):
|
def parse_date(date_string, assume_utc=False, as_utc=True, default=None):
|
||||||
'''
|
'''
|
||||||
|
@ -42,16 +42,16 @@ def supports_long_names(path):
|
|||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def shorten_component(s, byWhat):
|
def shorten_component(s, by_what):
|
||||||
l = len(s)
|
l = len(s)
|
||||||
if l < byWhat:
|
if l < by_what:
|
||||||
return s
|
return s
|
||||||
l = int((l-byWhat)/2)
|
l = (l - by_what)//2
|
||||||
if l <= 0:
|
if l <= 0:
|
||||||
return s
|
return s
|
||||||
return s[0:l] + s[-l:]
|
return s[:l] + s[-l:]
|
||||||
|
|
||||||
def shorten_components_to(length, components, more_to_take = 0):
|
def shorten_components_to(length, components, more_to_take=0):
|
||||||
filepath = os.sep.join(components)
|
filepath = os.sep.join(components)
|
||||||
extra = len(filepath) - (length - more_to_take)
|
extra = len(filepath) - (length - more_to_take)
|
||||||
if extra < 1:
|
if extra < 1:
|
||||||
@ -62,7 +62,7 @@ def shorten_components_to(length, components, more_to_take = 0):
|
|||||||
deltas.append(int(ceil(pct*extra)))
|
deltas.append(int(ceil(pct*extra)))
|
||||||
ans = []
|
ans = []
|
||||||
|
|
||||||
for i,x in enumerate(components):
|
for i, x in enumerate(components):
|
||||||
delta = deltas[i]
|
delta = deltas[i]
|
||||||
if delta > len(x):
|
if delta > len(x):
|
||||||
r = x[0] if x is components[-1] else ''
|
r = x[0] if x is components[-1] else ''
|
||||||
|
@ -75,7 +75,7 @@ class FormatterFunction(object):
|
|||||||
exc_type, exc_value, exc_traceback = sys.exc_info()
|
exc_type, exc_value, exc_traceback = sys.exc_info()
|
||||||
info = ': '.join(traceback.format_exception(exc_type, exc_value,
|
info = ': '.join(traceback.format_exception(exc_type, exc_value,
|
||||||
exc_traceback)[-2:]).replace('\n', '')
|
exc_traceback)[-2:]).replace('\n', '')
|
||||||
return _('Exception ' + info)
|
return _('Exception ') + info
|
||||||
|
|
||||||
all_builtin_functions = []
|
all_builtin_functions = []
|
||||||
class BuiltinFormatterFunction(FormatterFunction):
|
class BuiltinFormatterFunction(FormatterFunction):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user