Merge with ldolse preprocessdev
commit d20387ea74

BIN  resources/images/news/latimes.png (new binary file, 358 B; binary file not shown)
@@ -1,25 +1,25 @@
-# -*- coding: utf-8
 __license__ = 'GPL v3'
 __author__ = 'Luis Hernandez'
 __copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
-description = 'Periódico gratuito en español - v0.8 - 27 Jan 2011'
+__version__ = 'v0.85'
+__date__ = '31 January 2011'
 
 '''
 www.20minutos.es
 '''
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class AdvancedUserRecipe1294946868(BasicNewsRecipe):
 
-    title = u'20 Minutos'
+    title = u'20 Minutos new'
     publisher = u'Grupo 20 Minutos'
 
-    __author__ = 'Luis Hernández'
+    __author__ = 'Luis Hernandez'
-    description = 'Periódico gratuito en español'
+    description = 'Free spanish newspaper'
     cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
 
-    oldest_article = 5
+    oldest_article = 2
     max_articles_per_feed = 100
 
     remove_javascript = True
@@ -29,6 +29,7 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
     encoding = 'ISO-8859-1'
     language = 'es'
     timefmt = '[%a, %d %b, %Y]'
+    remove_empty_feeds = True
 
     keep_only_tags = [
        dict(name='div', attrs={'id':['content','vinetas',]})
@@ -43,13 +44,21 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
     remove_tags = [
        dict(name='ol', attrs={'class':['navigation',]})
        ,dict(name='span', attrs={'class':['action']})
-       ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
+       ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
        ,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']})
        ,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']})
        ,dict(name='ul', attrs={'id':['site-links']})
        ,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']})
     ]
 
+    extra_css = """
+        p{text-align: justify; font-size: 100%}
+        body{ text-align: left; font-size:100% }
+        h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
+        """
 
+    preprocess_regexps = [(re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL), lambda m: '')]
 
     feeds = [
        (u'Portada' , u'http://www.20minutos.es/rss/')
        ,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/')
@@ -65,6 +74,6 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
        ,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/')
        ,(u'Cine' , u'http://www.20minutos.es/rss/cine/')
        ,(u'Musica' , u'http://www.20minutos.es/rss/musica/')
        ,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/')
        ,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/')
     ]
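
A note on the preprocess_regexps entry added above: BasicNewsRecipe applies each (regex, function) pair to the downloaded article HTML in order, replacing every match with the function's return value, so the lambda here simply deletes the matched anchor tags. A minimal sketch of that mechanism, detached from the recipe machinery:

    import re

    # Each recipe entry is a (compiled pattern, replacement function) pair,
    # exactly as declared in the recipe above.
    preprocess_regexps = [
        (re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL),
         lambda match: ''),
    ]

    def apply_preprocess(html, regexps):
        # Simplified form of what calibre does to the raw HTML before parsing.
        for pattern, func in regexps:
            html = pattern.sub(func, html)
        return html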
resources/recipes/cinco_dias.recipe: new file (71 lines)

@@ -0,0 +1,71 @@
+__license__ = 'GPL v3'
+__author__ = 'Luis Hernandez'
+__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
+__version__ = 'v1.2'
+__date__ = '31 January 2011'
+
+'''
+http://www.cincodias.com/
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294946868(BasicNewsRecipe):
+
+    title = u'Cinco Dias'
+    publisher = u'Grupo Prisa'
+
+    __author__ = 'Luis Hernandez'
+    description = 'Spanish web about money and business, free edition'
+
+    cover_url = 'http://www.prisa.com/images/logos/logo_cinco_dias.gif'
+    oldest_article = 2
+    max_articles_per_feed = 100
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+
+    language = 'es'
+    remove_empty_feeds = True
+    encoding = 'ISO-8859-1'
+    timefmt = '[%a, %d %b, %Y]'
+
+    keep_only_tags = [
+       dict(name='div', attrs={'class':['cab_articulo cab_noticia','pos_3','txt_noticia','mod_despiece']})
+       ,dict(name='p', attrs={'class':['cintillo']})
+    ]
+
+    remove_tags_before = dict(name='div' , attrs={'class':['publi_h']})
+    remove_tags_after = dict(name='div' , attrs={'class':['tab_util util_estadisticas']})
+
+    remove_tags = [
+       dict(name='div', attrs={'class':['util-1','util-2','util-3','inner estirar','inner1','inner2','inner3','cont','tab_util util_estadisticas','tab_util util_enviar','mod_list_inf','mod_similares','mod_divisas','mod_sectores','mod_termometro','mod post','mod_img','mod_txt','nivel estirar','barra estirar','info_brujula btnBrujula','utilidad_brujula estirar']})
+       ,dict(name='li', attrs={'class':['lnk-fcbook','lnk-retweet','lnk-meneame','desplegable','comentarios','list-options','estirar']})
+       ,dict(name='ul', attrs={'class':['lista-izquierda','list-options','estirar']})
+       ,dict(name='p', attrs={'class':['autor']})
+    ]
+
+    extra_css = """
+        p{text-align: justify; font-size: 100%}
+        body{ text-align: left; font-size:100% }
+        h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
+        h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
+        """
+
+    feeds = [
+       (u'Ultima Hora' , u'http://www.cincodias.com/rss/feed.html?feedId=17029')
+       ,(u'Empresas' , u'http://www.cincodias.com/rss/feed.html?feedId=19')
+       ,(u'Mercados' , u'http://www.cincodias.com/rss/feed.html?feedId=20')
+       ,(u'Economia' , u'http://www.cincodias.com/rss/feed.html?feedId=21')
+       ,(u'Tecnorama' , u'http://www.cincodias.com/rss/feed.html?feedId=17230')
+       ,(u'Tecnologia' , u'http://www.cincodias.com/rss/feed.html?feedId=17106')
+       ,(u'Finanzas Personales' , u'http://www.cincodias.com/rss/feed.html?feedId=22')
+       ,(u'Fiscalidad' , u'http://www.cincodias.com/rss/feed.html?feedId=17107')
+       ,(u'Vivienda' , u'http://www.cincodias.com/rss/feed.html?feedId=17108')
+       ,(u'Tendencias' , u'http://www.cincodias.com/rss/feed.html?feedId=17109')
+       ,(u'Empleo' , u'http://www.cincodias.com/rss/feed.html?feedId=17110')
+       ,(u'IBEX 35' , u'http://www.cincodias.com/rss/feed.html?feedId=17125')
+       ,(u'Sectores' , u'http://www.cincodias.com/rss/feed.html?feedId=17126')
+       ,(u'Opinion' , u'http://www.cincodias.com/rss/feed.html?feedId=17105')
+    ]
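
The new recipe leans on calibre's declarative filtering: remove_tags_before/remove_tags_after trim the page to the article region, keep_only_tags keeps the matched containers, and remove_tags prunes unwanted elements inside them. Roughly, in BeautifulSoup terms (illustrative sketch only; the real logic lives inside BasicNewsRecipe):

    def filter_article(soup, keep_only_tags, remove_tags):
        # Keep the declared containers, then prune the declared junk inside them.
        kept = [tag for spec in keep_only_tags for tag in soup.findAll(**spec)]
        for root in kept:
            for spec in remove_tags:
                for tag in root.findAll(**spec):
                    tag.extract()
        return kept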
@@ -143,7 +143,7 @@ def add_pipeline_options(parser, plumber):
              ' patterns. Disabled by default. Use %s to enable. '
              ' Individual actions can be disabled with the %s options.')
              % ('--enable-heuristics', '--disable-*'),
              ['enable_heuristics', 'replace_scene_breaks'] + HEURISTIC_OPTIONS
              ),
 
     'SEARCH AND REPLACE' : (
@@ -532,7 +532,7 @@ OptionRecommendation(name='format_scene_breaks',
             'horizontal rules.')),
 
 OptionRecommendation(name='replace_scene_breaks',
-        recommended_value='', level=OptionRecommendation.LOW,
+        recommended_value=None, level=OptionRecommendation.LOW,
         help=_('Replace scene breaks with the specified text.')),
 
 OptionRecommendation(name='dehyphenate',
@@ -26,9 +26,14 @@ class HeuristicProcessor(object):
         self.blanks_deleted = False
         self.blanks_between_paragraphs = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|spacer)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
+        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
+        self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
+        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
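
The blankreg change above means paragraphs already classed softbreak or whitespace are no longer treated as plain blanks. A standalone check of the pattern (copied from the hunk):

    import re

    blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)

    html = '<p></p><p class="softbreak"></p><p class="x"> </p>'
    # Matches the first and third empty paragraphs; the softbreak is skipped.
    print([m.group('openline') for m in blankreg.finditer(html)])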
@@ -187,19 +192,17 @@ class HeuristicProcessor(object):
 
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_line_open = self.line_open
         title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
         title_header_close = ")"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        chapter_line_close = self.line_close
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
 
         is_pdftohtml = self.is_pdftohtml(html)
         if is_pdftohtml:
-            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
-            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
             title_line_open = "<(?P<outer2>p)[^>]*>\s*"
             title_line_close = "\s*</(?P=outer2)>"
 
@@ -374,13 +377,15 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Delete self closing paragraph tags
-        html = re.sub('<p\s?/>', '', html)
+        # Re-open self closing paragraph tags
+        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
+        # Empty heading tags
+        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
         self.deleted_nbsps = True
         return html
 
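
The self-closing paragraph change is a behavioural fix, not just a cleanup tweak: <p/> tags used to be deleted, which hid intentional blank lines from the later blank-line heuristics; now they are expanded into real empty paragraphs. A standalone sketch (the replacement string in the hunk carries a non-breaking space between the tags):

    import re

    html = 'Before<p/>After<p class="x"/>End'
    # Both self-closing forms become open/close pairs holding a nbsp.
    print(re.sub('<p[^>/]*/>', u'<p>\u00a0</p>', html))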
@@ -418,33 +423,98 @@ class HeuristicProcessor(object):
             if getattr(self.extra_opts, option, False):
                 return True
         return False
 
-    def detect_blank_formatting(self, html):
-        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
-        blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
 
-        def markup_spacers(match):
-            blanks = match.group(0)
-            blanks = self.blankreg.sub('\n<p class="spacer"> </p>', blanks)
-            return blanks
-        html = blanks_before_headings.sub(markup_spacers, html)
-        html = blanks_after_headings.sub(markup_spacers, html)
+    def merge_blanks(self, html, blanks_count=None):
+        base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
+        em_per_line = 1.5 # Add another 1.5 em for each additional blank
+
+        def merge_matches(match):
+            to_merge = match.group(0)
+            lines = float(len(self.single_blank.findall(to_merge))) - 1.
+            em = base_em + (em_per_line * lines)
+            if to_merge.find('whitespace') != -1:
+                newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
+            else:
+                newline = self.any_multi_blank.sub('\n<p class="softbreak'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
+            return newline
+
+        html = self.any_multi_blank.sub(merge_matches, html)
+        return html
+
+    def detect_whitespace(self, html):
+        blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
+        blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
+
+        def merge_header_whitespace(match):
+            initblanks = match.group('initparas')
+            endblanks = match.group('endparas')
+            heading = match.group('heading')
+            top_margin = ''
+            bottom_margin = ''
+            if initblanks is not None:
+                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+            if endblanks is not None:
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
+
+            if initblanks == None and endblanks == None:
+                return heading
+            else:
+                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
+                return heading
+
+        html = blanks_around_headings.sub(merge_header_whitespace, html)
+
+        def markup_whitespaces(match):
+            blanks = match.group(0)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
+            return blanks
+
+        html = blanks_n_nopunct.sub(markup_whitespaces, html)
         if self.html_preprocess_sections > self.min_chapters:
-            html = re.sub('(?si)^.*?(?=<h\d)', markup_spacers, html)
+            html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html)
 
         return html
 
     def detect_soft_breaks(self, html):
         if not self.blanks_deleted and self.blanks_between_paragraphs:
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
         else:
-            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
         return html
 
+    def markup_user_break(self, replacement_break):
+        '''
+        Takes a string the user supplies and wraps it in markup that will be centered with
+        appropriate margins. <hr> and <img> tags are allowed. If the user specifies
+        a style with width attributes in the <hr> tag then the appropriate margins are
+        applied to wrapping divs. This is because many ebook devices don't support margin:auto.
+        All other html is converted to text.
+        '''
+        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em">'
+        if re.findall('(<|>)', replacement_break):
+            if re.match('^<hr', replacement_break):
+                if replacement_break.find('width') != -1:
+                    width = int(re.sub('.*?width(:|=)(?P<wnum>\d+).*', '\g<wnum>', replacement_break))
+                    divpercent = (100 - width) / 2
+                    hr_open = re.sub('45', str(divpercent), hr_open)
+                    scene_break = hr_open+replacement_break+'</div>'
+                else:
+                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+            elif re.match('^<img', replacement_break):
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+            else:
+                replacement_break = html2text(replacement_break)
+                replacement_break = re.sub('\s', ' ', replacement_break)
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+        else:
+            replacement_break = re.sub('\s', ' ', replacement_break)
+            scene_break = self.scene_break_open+replacement_break+'</p>'
+
+        return scene_break
+
     def __call__(self, html):
         self.log.debug("********* Heuristic processing HTML *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
         try:
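
The margin arithmetic in merge_blanks encodes the size of the original blank run: a sequence of n empty paragraphs collapses to a single spacer paragraph with em = base_em + em_per_line * (n - 1), and the computed value also lands in the class name (em * 10). A worked check of the values from the code above:

    for n in (2, 3, 4):
        em = .5 + 1.5 * (n - 1)
        print(n, em, 'softbreak%d' % int(em * 10))
    # 2 -> 2.0em (softbreak20), 3 -> 3.5em (softbreak35), 4 -> 5.0em (softbreak50)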
@@ -458,7 +528,7 @@ class HeuristicProcessor(object):
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
+        #self.dump(html, 'after_arrange_line_endings')
         if self.cleanup_required():
             ###### Check Markup ######
             #
@@ -478,6 +548,11 @@ class HeuristicProcessor(object):
             # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+            self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
@@ -489,6 +564,7 @@ class HeuristicProcessor(object):
 
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
+        #self.dump(html, 'after_chapter_markup')
 
         if getattr(self.extra_opts, 'italicize_common_cases', False):
             html = self.markup_italicis(html)
@@ -498,7 +574,7 @@ class HeuristicProcessor(object):
         if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
             self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
             html = self.blankreg.sub('', html)
 
         # Determine line ending type
@@ -539,7 +615,7 @@ class HeuristicProcessor(object):
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
             self.log.debug("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
         if getattr(self.extra_opts, 'renumber_headings', False):
@@ -549,14 +625,32 @@ class HeuristicProcessor(object):
             doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
             html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
+        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks
+        # and style them with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
+        # Multiple sequential blank paragraphs are merged with appropriate margins.
+        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
         if getattr(self.extra_opts, 'format_scene_breaks', False):
-            html = self.detect_blank_formatting(html)
+            html = self.detect_whitespace(html)
             html = self.detect_soft_breaks(html)
-            # Center separator lines
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
-            #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
+            blanks_count = len(self.any_multi_blank.findall(html))
+            if blanks_count >= 1:
+                html = self.merge_blanks(html, blanks_count)
+            scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+            # If the user has enabled scene break replacement, then either softbreaks
+            # or 'hard' scene breaks are replaced, depending on which is in use.
+            # Otherwise separator lines are centered, use a bit larger margin in this case.
+            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
+            if replacement_break is not None:
+                replacement_break = self.markup_user_break(replacement_break)
+                if len(scene_break.findall(html)) >= 1:
+                    html = scene_break.sub(replacement_break, html)
+                else:
+                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
+            else:
+                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
 
         if self.deleted_nbsps:
-            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            # put back non-breaking spaces in empty paragraphs so they render correctly
             html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
         return html
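
In markup_user_break, the width handling replaces margin:auto (poorly supported on many e-book devices) with explicit side margins: for a rule of width w percent, each side margin is (100 - w) / 2. A standalone approximation of that calculation (hypothetical helper name, mirroring the logic above):

    import re

    def center_margins(replacement_break, default=45):
        # Mirrors the width handling in markup_user_break above.
        if replacement_break.find('width') != -1:
            width = int(re.sub(r'.*?width(:|=)(?P<wnum>\d+).*', r'\g<wnum>', replacement_break))
            return (100 - width) // 2
        return default

    print(center_margins('<hr style="width:30%">'))  # 35
    print(center_margins('<hr width=20>'))           # 40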
src/calibre/ebooks/metadata/sources/base.py: new file (61 lines)

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.customize import Plugin
+
+class Source(Plugin):
+
+    type = _('Metadata source')
+    author = 'Kovid Goyal'
+
+    supported_platforms = ['windows', 'osx', 'linux']
+
+    result_of_identify_is_complete = True
+
+    def get_author_tokens(self, authors):
+        'Take a list of authors and return a list of tokens useful for an '
+        'AND search query'
+        # Leave ' in there for Irish names
+        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+        for au in authors:
+            for tok in au.split():
+                yield pat.sub('', tok)
+
+    def split_jobs(self, jobs, num):
+        'Split a list of jobs into at most num groups, as evenly as possible'
+        groups = [[] for i in range(num)]
+        jobs = list(jobs)
+        while jobs:
+            for gr in groups:
+                try:
+                    job = jobs.pop()
+                except IndexError:
+                    break
+                gr.append(job)
+        return [g for g in groups if g]
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+        '''
+        Identify a book by its title/author/isbn/etc.
+
+        :param log: A log object, use it to output debugging information/errors
+        :param result_queue: A result Queue, results should be put into it.
+                             Each result is a Metadata object
+        :param abort: If abort.is_set() returns True, abort further processing
+                      and return as soon as possible
+        :param title: The title of the book, can be None
+        :param authors: A list of authors of the book, can be None
+        :param identifiers: A dictionary of other identifiers, most commonly
+                            {'isbn':'1234...'}
+        :return: None if no errors occurred, otherwise a unicode representation
+                 of the error suitable for showing to the user
+        '''
+        return None
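
split_jobs deals jobs round-robin from the end of the list, so group sizes differ by at most one. A standalone copy of its logic, for illustration:

    def split_jobs(jobs, num):
        # Copy of Source.split_jobs, detached from the plugin class.
        groups = [[] for i in range(num)]
        jobs = list(jobs)
        while jobs:
            for gr in groups:
                try:
                    job = jobs.pop()
                except IndexError:
                    break
                gr.append(job)
        return [g for g in groups if g]

    print(split_jobs(range(7), 5))
    # [[6, 1], [5, 0], [4], [3], [2]]

get_author_tokens strips most punctuation but deliberately keeps apostrophes, so a name like O'Brien survives as a single search token.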
src/calibre/ebooks/metadata/sources/google.py: new file (215 lines)

@@ -0,0 +1,215 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import time
+from urllib import urlencode
+from functools import partial
+from threading import Thread
+
+from lxml import etree
+
+from calibre.ebooks.metadata.sources import Source
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import parse_date, utcnow
+from calibre import browser, as_unicode
+
+NAMESPACES = {
+    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+    'atom' : 'http://www.w3.org/2005/Atom',
+    'dc': 'http://purl.org/dc/terms'
+}
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+
+total_results = XPath('//openSearch:totalResults')
+start_index = XPath('//openSearch:startIndex')
+items_per_page = XPath('//openSearch:itemsPerPage')
+entry = XPath('//atom:entry')
+entry_id = XPath('descendant::atom:id')
+creator = XPath('descendant::dc:creator')
+identifier = XPath('descendant::dc:identifier')
+title = XPath('descendant::dc:title')
+date = XPath('descendant::dc:date')
+publisher = XPath('descendant::dc:publisher')
+subject = XPath('descendant::dc:subject')
+description = XPath('descendant::dc:description')
+language = XPath('descendant::dc:language')
+
+
+def to_metadata(browser, log, entry_):
+
+    def get_text(extra, x):
+        try:
+            ans = x(extra)
+            if ans:
+                ans = ans[0].text
+                if ans and ans.strip():
+                    return ans.strip()
+        except:
+            log.exception('Programming error:')
+        return None
+
+    id_url = entry_id(entry_)[0].text
+    title_ = ': '.join([x.text for x in title(entry_)]).strip()
+    authors = [x.text.strip() for x in creator(entry_) if x.text]
+    if not authors:
+        authors = [_('Unknown')]
+    if not id_url or not title_:
+        # Silently discard this entry
+        return None
+
+    mi = Metadata(title_, authors)
+    try:
+        raw = browser.open(id_url).read()
+        feed = etree.fromstring(raw)
+        extra = entry(feed)[0]
+    except:
+        log.exception('Failed to get additional details for', mi.title)
+        return mi
+
+    mi.comments = get_text(extra, description)
+    #mi.language = get_text(extra, language)
+    mi.publisher = get_text(extra, publisher)
+
+    # Author sort
+    for x in creator(extra):
+        for key, val in x.attrib.items():
+            if key.endswith('file-as') and val and val.strip():
+                mi.author_sort = val
+                break
+    # ISBN
+    isbns = []
+    for x in identifier(extra):
+        t = str(x.text).strip()
+        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
+            if t[:5].upper() == 'ISBN:':
+                isbns.append(t[5:])
+    if isbns:
+        mi.isbn = sorted(isbns, key=len)[-1]
+
+    # Tags
+    try:
+        btags = [x.text for x in subject(extra) if x.text]
+        tags = []
+        for t in btags:
+            tags.extend([y.strip() for y in t.split('/')])
+        tags = list(sorted(list(set(tags))))
+    except:
+        log.exception('Failed to parse tags:')
+        tags = []
+    if tags:
+        mi.tags = [x.replace(',', ';') for x in tags]
+
+    # pubdate
+    pubdate = get_text(extra, date)
+    if pubdate:
+        try:
+            default = utcnow().replace(day=15)
+            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
+        except:
+            log.exception('Failed to parse pubdate')
+
+    return mi
+
+
+class Worker(Thread):
+
+    def __init__(self, log, entries, abort, result_queue):
+        self.browser, self.log, self.entries = browser(), log, entries
+        self.abort, self.result_queue = abort, result_queue
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        for i in self.entries:
+            try:
+                ans = to_metadata(self.browser, self.log, i)
+                if ans is not None:
+                    self.result_queue.put(ans)
+            except:
+                self.log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
+            if self.abort.is_set():
+                break
+
+
+class GoogleBooks(Source):
+
+    name = 'Google Books'
+
+    def create_query(self, log, title=None, authors=None, identifiers={},
+            start_index=1):
+        BASE_URL = 'http://books.google.com/books/feeds/volumes?'
+        isbn = identifiers.get('isbn', None)
+        q = ''
+        if isbn is not None:
+            q += 'isbn:'+isbn
+        elif title or authors:
+            def build_term(prefix, parts):
+                return ' '.join('in'+prefix + ':' + x for x in parts)
+            if title is not None:
+                q += build_term('title', title.split())
+            if authors:
+                q += ('+' if q else '')+build_term('author',
+                        self.get_author_tokens(authors))
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        if not q:
+            return None
+        return BASE_URL+urlencode({
+            'q':q,
+            'max-results':20,
+            'start-index':start_index,
+            'min-viewability':'none',
+        })
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
+        query = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        try:
+            raw = browser().open_novisit(query).read()
+        except Exception, e:
+            log.exception('Failed to make identify query: %r'%query)
+            return as_unicode(e)
+
+        try:
+            parser = etree.XMLParser(recover=True, no_network=True)
+            feed = etree.fromstring(raw, parser=parser)
+            entries = entry(feed)
+        except Exception, e:
+            log.exception('Failed to parse identify results')
+            return as_unicode(e)
+
+        groups = self.split_jobs(entries, 5) # At most 5 threads
+        if not groups:
+            return
+        workers = [Worker(log, entries, abort, result_queue) for entries in
+                groups]
+
+        if abort.is_set():
+            return
+
+        for worker in workers: worker.start()
+
+        has_alive_worker = True
+        while has_alive_worker and not abort.is_set():
+            has_alive_worker = False
+            for worker in workers:
+                if worker.is_alive():
+                    has_alive_worker = True
+            time.sleep(0.1)
+
+        return None
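
To make create_query concrete: for title 'Great Expectations' and authors ['Charles Dickens'], the query term becomes 'intitle:Great intitle:Expectations+inauthor:Charles inauthor:Dickens', which is then urlencoded onto the feed URL. A sketch (Python 2, matching the module above; parameter order in the output depends on dict iteration):

    from urllib import urlencode

    q = 'intitle:Great intitle:Expectations+inauthor:Charles inauthor:Dickens'
    print('http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q,
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    }))
    # An ISBN lookup instead sets q to a term of the form 'isbn:<digits>'.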
@@ -258,7 +258,6 @@ class Config(ResizableDialog, Ui_Dialog):
         if not w.pre_commit_check():
             return
         x = w.commit(save_defaults=False)
-        print x
         recs.update(x)
         self.opf_file, self.cover_file = self.mw.opf_file, self.mw.cover_file
         self._recommendations = recs
@@ -429,10 +429,12 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                 old_extensions.add(ext)
         for ext in new_extensions:
             self.db.add_format(self.row, ext, open(paths[ext], 'rb'), notify=False)
-        db_extensions = set([f.lower() for f in self.db.formats(self.row).split(',')])
+        dbfmts = self.db.formats(self.row)
+        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts
+            else [])])
         extensions = new_extensions.union(old_extensions)
         for ext in db_extensions:
-            if ext not in extensions:
+            if ext not in extensions and ext in self.original_formats:
                 self.db.remove_format(self.row, ext, notify=False)
 
     def show_format(self, item, *args):
@@ -576,6 +578,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
         self.orig_date = qt_to_dt(self.date.date())
 
         exts = self.db.formats(row)
+        self.original_formats = []
         if exts:
             exts = exts.split(',')
             for ext in exts:
@@ -586,6 +589,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                 if size is None:
                     continue
                 Format(self.formats, ext, size, timestamp=timestamp)
+                self.original_formats.append(ext.lower())
 
 
         self.initialize_combos()
@@ -472,6 +472,7 @@ class FormatsManager(QWidget): # {{{
     def initialize(self, db, id_):
         self.changed = False
         exts = db.formats(id_, index_is_id=True)
+        self.original_val = set([])
         if exts:
             exts = exts.split(',')
             for ext in exts:
@@ -482,6 +483,7 @@ class FormatsManager(QWidget): # {{{
                 if size is None:
                     continue
                 Format(self.formats, ext, size, timestamp=timestamp)
+                self.original_val.add(ext.lower())
 
     def commit(self, db, id_):
         if not self.changed:
@@ -500,11 +502,12 @@ class FormatsManager(QWidget): # {{{
         for ext in new_extensions:
             db.add_format(id_, ext, open(paths[ext], 'rb'), notify=False,
                     index_is_id=True)
-        db_extensions = set([f.lower() for f in db.formats(id_,
-            index_is_id=True).split(',')])
+        dbfmts = db.formats(id_, index_is_id=True)
+        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts
+            else [])])
         extensions = new_extensions.union(old_extensions)
         for ext in db_extensions:
-            if ext not in extensions:
+            if ext not in extensions and ext in self.original_val:
                 db.remove_format(id_, ext, notify=False, index_is_id=True)
 
         self.changed = False
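
Both format-handling hunks fix the same crash: db.formats() returns None for a book with no formats, and the old code called .split(',') on it directly, raising AttributeError. The new guard tolerates both cases:

    # db.formats() yields a comma separated string, or None if no formats.
    for dbfmts in ('EPUB,MOBI', None):
        db_extensions = set([f.lower() for f in (dbfmts.split(',') if dbfmts else [])])
        print(db_extensions)
    # prints the lowercased extension set, then an empty set

The added original_formats/original_val bookkeeping also means only formats that were present when the dialog opened can be deleted on commit.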
@@ -311,10 +311,15 @@ remove all non-breaking-space entities, or may include false positive matches re
 
 :guilabel:`Ensure scene breaks are consistently formatted`
     With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.
-    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
-    page width. Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and
-    thus become difficult to distinguish.
+    'Soft' scene break markers, i.e. scene breaks defined only by extra white space, are styled to ensure that they
+    will not be displayed in conjunction with page breaks.
+
+:guilabel:`Replace scene breaks`
+    If this option is configured then |app| will replace the scene break markers it finds with the replacement text specified by the
+    user. In general you should avoid using HTML tags: |app| will discard any tags and use pre-defined markup. <hr />
+    tags, i.e. horizontal rules, are an exception. These can optionally be specified with styles; if you choose to add your own
+    style, be sure to include the 'width' setting, otherwise the style information will be discarded.
 
 :guilabel:`Remove unnecessary hyphens`
     |app| will analyze all hyphenated content in the document when this option is enabled. The document itself is used
     as a dictionary for analysis. This allows |app| to accurately remove hyphens for any words in the document in any language,
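
For example, setting :guilabel:`Replace scene breaks` to ``<hr style="width:30%" />`` keeps the 30% rule, and (per the ``markup_user_break`` code earlier in this commit) |app| wraps it in a div with ``margin-left: 35%; margin-right: 35%`` so it stays centered on devices that ignore ``margin:auto``.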
@@ -628,7 +633,7 @@ between 0 and 1. The default is 0.45, just under the median line length. Lower t
 text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
 
 Also, they often have headers and footers as part of the document that will become included with the text.
-Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
+Use the Search and Replace panel to remove headers and footers to mitigate this issue. If the headers and footers are not
 removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read
 :ref:`regexptutorial`.
 
@@ -391,6 +391,8 @@ Take your pick:
     * A tribute to the SONY Librie which was the first e-ink based e-book reader
     * My wife chose it ;-)
 
+|app| is pronounced as cal-i-ber *not* ca-libre. If you're wondering, |app| is the British/commonwealth spelling for caliber. Being Indian, that's the natural spelling for me.
+
 Why does |app| show only some of my fonts on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 |app| embeds fonts in ebook files it creates. E-book files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, thus they cannot be embedded. |app| shows only TrueType fonts found on your system. You can obtain many TrueType fonts on the web. Simply download the .ttf files and add them to the Library/Fonts directory in your home directory.