mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
merge from trunk
This commit is contained in:
commit
e77e42d1a9
55
resources/recipes/europa_press.recipe
Normal file
55
resources/recipes/europa_press.recipe
Normal file
@ -0,0 +1,55 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Luis Hernandez'
|
||||
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
||||
__version__ = 'v1.0'
|
||||
__date__ = '30 January 2011'
|
||||
|
||||
'''
|
||||
www.europapress.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
    '''
    Recipe for Europa Press (www.europapress.es), assembled from the
    RSS channels listed in ``feeds``.
    '''

    # Publication metadata.
    title = u'Europa Press'
    author = 'Luis Hernandez'
    description = 'spanish news agency'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    # Download limits.
    oldest_article = 2
    max_articles_per_feed = 100

    # Content handling switches.
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Tag selectors handed to BasicNewsRecipe for trimming article pages;
    # see the calibre recipe API for the exact semantics of each attribute.
    remove_tags_before = {'name': 'div', 'attrs': {'class': ['nivel1 bg_3col']}}
    remove_tags_after = {'name': 'div', 'attrs': {'id': ['ImprimirEnviarNoticia']}}

    remove_tags = [
        {'name': 'ul', 'attrs': {'id': ['entidadesNoticia', 'MenuSecciones']}},
        {'name': 'div', 'attrs': {'id': [
            'ImprimirEnviarNoticia',
            'PublicidadSuperior',
            'CabeceraDerecha',
            'Comentarios',
            'comentarios full fbConnectAPI',
            'ComentarEstaNoticia',
            'ctl00_Superior_Main_MasEnChance_cajamasnoticias',
            'gl_chn',
            'videos_portada_derecha',
            'galeria_portada_central',
            'galeria_portada_central_boxes',
        ]}},
        {'name': 'div', 'attrs': {'class': [
            'infoRelacionada',
            'col_1',
            'buscador',
            'caja doblecolumna strong',
            'CHANCE_EP_Encuesta_frontal text',
            'seccionportada col_0',
            'seccion header',
            'text',
            'pie caption_over',
        ]}},
        {'name': 'a', 'attrs': {'class': ['buscadorLabel']}},
        {'name': 'span', 'attrs': {'class': ['editado']}},
        {'name': 'table'},
        {'name': 'li'},
    ]

    # (section title, RSS URL) pairs, one per Europa Press channel.
    feeds = [
        (u'Portada', u'http://www.europapress.es/rss/rss.aspx'),
        (u'Nacional', u'http://www.europapress.es/rss/rss.aspx?ch=66'),
        (u'Internacional', u'http://www.europapress.es/rss/rss.aspx?ch=69'),
        (u'Economia', u'http://www.europapress.es/rss/rss.aspx?ch=136'),
        (u'Deportes', u'http://www.europapress.es/rss/rss.aspx?ch=67'),
        (u'Cultura', u'http://www.europapress.es/rss/rss.aspx?ch=126'),
        (u'Sociedad', u'http://www.europapress.es/rss/rss.aspx?ch=73'),
        (u'Motor', u'http://www.europapress.es/rss/rss.aspx?ch=435'),
        (u'CHANCE', u'http://www.europapress.es/rss/rss.aspx?ch=549'),
        (u'Comunicados', u'http://www.europapress.es/rss/rss.aspx?ch=137'),
    ]
|
||||
|
@ -35,7 +35,7 @@ class IrishTimes(BasicNewsRecipe):
|
||||
def print_version(self, url):
|
||||
if url.count('rss.feedsportal.com'):
|
||||
u = 'http://www.irishtimes.com' + \
|
||||
(((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
|
||||
(((url[70:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
|
||||
else:
|
||||
u = url.replace('.html','_pf.html')
|
||||
return u
|
||||
|
43
resources/recipes/radio_prague.recipe
Normal file
43
resources/recipes/radio_prague.recipe
Normal file
@ -0,0 +1,43 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1291540961(BasicNewsRecipe):
    '''
    Recipe for the English-language RSS feeds of Radio Prague
    (www.radio.cz/feeds/rss/en/...).
    '''

    # Publication metadata.
    title = u'Radio Praha'
    __author__ = 'Francois Pellicaan'
    description = 'News and information from and about The Czech republic. '
    publisher = 'Radio Prague'
    category = 'News'
    language = 'en_CZ'
    publication_type = 'newsportal'
    encoding = 'utf8'

    # Download limits and content handling switches.
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True

    # Stylesheet text supplied to the recipe framework (kept verbatim).
    extra_css = 'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; } \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }'

    # Tag selectors handed to BasicNewsRecipe; see the calibre recipe API
    # for the exact semantics of keep_only_tags / remove_tags.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['main']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['cleaner', 'options', 'toolsXXL']}),
        dict(name='ul', attrs={'class': ['tools']}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Current Affairs', 'http://www.radio.cz/feeds/rss/en/themes/curraffrs.xml'),
        (u'Society', 'http://www.radio.cz/feeds/rss/en/themes/society.xml'),
        # The scheme was previously duplicated ('http:http://...'), which is
        # not a valid URL and made this feed unreachable.
        (u'European Union', 'http://www.radio.cz/feeds/rss/en/themes/eu.xml'),
        (u'Foreign policy', 'http://www.radio.cz/feeds/rss/en/themes/foreignpolicy.xml'),
        (u'Business', 'http://www.radio.cz/feeds/rss/en/themes/business.xml'),
        (u'Culture', 'http://www.radio.cz/feeds/rss/en/themes/culture.xml'),
        (u'Czechs abroad', 'http://www.radio.cz/feeds/rss/en/themes/czechabroad.xml'),
        (u'History', 'http://www.radio.cz/feeds/rss/en/themes/history.xml'),
        (u'Nature', 'http://www.radio.cz/feeds/rss/en/themes/nature.xml'),
        (u'Science', 'http://www.radio.cz/feeds/rss/en/themes/science.xml'),
        (u'Sport', 'http://www.radio.cz/feeds/rss/en/themes/sport.xml'),
        (u'Travel', 'http://www.radio.cz/feeds/rss/en/themes/travel.xml'),
    ]
|
44
resources/recipes/radio_praha.recipe
Normal file
44
resources/recipes/radio_praha.recipe
Normal file
@ -0,0 +1,44 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1291540961(BasicNewsRecipe):
    '''
    Recipe for the Czech-language RSS feeds of Radio Praha
    (www.radio.cz/feeds/rss/cs/...).
    '''

    # Publication metadata.
    title = u'Radio Praha'
    __author__ = 'Francois Pellicaan'
    description = u'Česká oficiální mezinárodní vysílací stanice.'
    publisher = u'Český rozhlas'
    category = 'News'
    language = 'cs'
    publication_type = 'newsportal'
    encoding = 'utf8'

    # Download limits and content handling switches.
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    use_embedded_content = False
    no_stylesheets = True

    # Stylesheet text supplied to the recipe framework (kept verbatim).
    extra_css = u'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; } \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }'

    # Tag selectors handed to BasicNewsRecipe; see the calibre recipe API
    # for the exact semantics of keep_only_tags / remove_tags.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': ['main']}},
    ]
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['cleaner', 'options', 'toolsXXL']}},
        {'name': 'ul', 'attrs': {'class': ['tools']}},
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Domácí politika', 'http://www.radio.cz/feeds/rss/cs/oblast/dompol.xml'),
        (u'Společnost', 'http://www.radio.cz/feeds/rss/cs/oblast/spolecnost.xml'),
        (u'Evropská unie', 'http://www.radio.cz/feeds/rss/cs/oblast/eu.xml'),
        (u'Zahraniční politika', 'http://www.radio.cz/feeds/rss/cs/oblast/zahrpol.xml'),
        (u'Ekonomika', 'http://www.radio.cz/feeds/rss/cs/oblast/ekonomika.xml'),
        (u'Kultura', 'http://www.radio.cz/feeds/rss/cs/oblast/kultura.xml'),
        (u'Krajané', 'http://www.radio.cz/feeds/rss/cs/oblast/krajane.xml'),
        (u'Historie', 'http://www.radio.cz/feeds/rss/cs/oblast/historie.xml'),
        (u'Příroda', 'http://www.radio.cz/feeds/rss/cs/oblast/priroda.xml'),
        (u'Věda', 'http://www.radio.cz/feeds/rss/cs/oblast/veda.xml'),
        (u'Sport', 'http://www.radio.cz/feeds/rss/cs/oblast/sport.xml'),
        (u'Cestování', 'http://www.radio.cz/feeds/rss/cs/oblast/cestovani.xml'),
    ]
|
@ -15,6 +15,7 @@ from calibre import guess_type, strftime
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
|
||||
from calibre.library.comments import comments_to_html
|
||||
from calibre.utils.date import is_date_undefined
|
||||
|
||||
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
|
||||
|
||||
@ -130,6 +131,9 @@ def render_jacket(mi, output_profile,
|
||||
publisher = ''
|
||||
|
||||
try:
|
||||
if is_date_undefined(mi.pubdate):
|
||||
pubdate = ''
|
||||
else:
|
||||
pubdate = strftime(u'%Y', mi.pubdate.timetuple())
|
||||
except:
|
||||
pubdate = ''
|
||||
@ -175,18 +179,23 @@ def render_jacket(mi, output_profile,
|
||||
soup = BeautifulSoup(generated_html)
|
||||
if not series:
|
||||
series_tag = soup.find(attrs={'class':'cbj_series'})
|
||||
if series_tag is not None:
|
||||
series_tag.extract()
|
||||
if not rating:
|
||||
rating_tag = soup.find(attrs={'class':'cbj_rating'})
|
||||
if rating_tag is not None:
|
||||
rating_tag.extract()
|
||||
if not tags:
|
||||
tags_tag = soup.find(attrs={'class':'cbj_tags'})
|
||||
if tags_tag is not None:
|
||||
tags_tag.extract()
|
||||
if not pubdate:
|
||||
pubdate_tag = soup.find(attrs={'class':'cbj_pubdate'})
|
||||
pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
|
||||
if pubdate_tag is not None:
|
||||
pubdate_tag.extract()
|
||||
if output_profile.short_name != 'kindle':
|
||||
hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
|
||||
if hr_tag is not None:
|
||||
hr_tag.extract()
|
||||
|
||||
return soup.renderContents(None)
|
||||
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||
normalize_line_endings, convert_textile, remove_indents
|
||||
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin):
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'unwrap_lines', False)
|
||||
|
||||
if options.txt_in_remove_indents:
|
||||
txt = remove_indents(txt)
|
||||
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Reformat paragraphs to block formatting based on the detected type.
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print are transformed to block for processing.
|
||||
@ -114,6 +106,7 @@ class TXTInput(InputFormatPlugin):
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'print':
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
txt = block_to_single_line(txt)
|
||||
elif options.paragraph_type == 'unformatted':
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
# unwrap lines based on punctuation
|
||||
@ -122,6 +115,8 @@ class TXTInput(InputFormatPlugin):
|
||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
else:
|
||||
txt = block_to_single_line(txt)
|
||||
|
||||
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
@ -130,6 +125,15 @@ class TXTInput(InputFormatPlugin):
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
# User requested transformation on the text.
|
||||
if options.txt_in_remove_indents:
|
||||
txt = remove_indents(txt)
|
||||
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Process the text using the appropriate text processor.
|
||||
html = ''
|
||||
if options.formatting_type == 'markdown':
|
||||
|
@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
|
||||
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||
|
||||
def clean_txt(txt):
|
||||
'''
|
||||
Run transformations on the text to put it into
|
||||
consistent state.
|
||||
'''
|
||||
if isbytestring(txt):
|
||||
txt = txt.decode('utf-8', 'replace')
|
||||
# Strip whitespace from the end of the line. Also replace
|
||||
@ -42,6 +46,15 @@ def clean_txt(txt):
|
||||
return txt
|
||||
|
||||
def split_txt(txt, epub_split_size_kb=0):
|
||||
'''
|
||||
Ensure there are split points for converting
|
||||
to EPUB. A misdetected paragraph type can
|
||||
result in the entire document being one giant
|
||||
paragraph. In this case the EPUB parser will not
|
||||
be able to determine where to split the file
|
||||
to accommodate the EPUB file size limitation
|
||||
and will fail.
|
||||
'''
|
||||
#Takes care if there is no point to split
|
||||
if epub_split_size_kb > 0:
|
||||
if isinstance(txt, unicode):
|
||||
@ -59,6 +72,12 @@ def split_txt(txt, epub_split_size_kb=0):
|
||||
return txt
|
||||
|
||||
def convert_basic(txt, title='', epub_split_size_kb=0):
|
||||
'''
|
||||
Converts plain text to html by putting all paragraphs in
|
||||
<p> tags. It condenses and retains blank lines when necessary.
|
||||
|
||||
Requires paragraphs to be in single line format.
|
||||
'''
|
||||
txt = clean_txt(txt)
|
||||
txt = split_txt(txt, epub_split_size_kb)
|
||||
|
||||
@ -99,15 +118,25 @@ def separate_paragraphs_single_line(txt):
|
||||
return txt
|
||||
|
||||
def separate_paragraphs_print_formatted(txt):
|
||||
txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
|
||||
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
|
||||
return txt
|
||||
|
||||
def block_to_single_line(txt):
    '''
    Join the lines inside each block into a single line.

    A newline is replaced by one space only when it has a non-newline
    character directly before and directly after it, so blank lines
    (consecutive newlines) and leading/trailing newlines are left alone.
    '''
    inner_newline = re.compile(r'(?<=.)\n(?=.)')
    return inner_newline.sub(' ', txt)
|
||||
|
||||
def preserve_spaces(txt):
    '''
    Replace runs of multiple spaces, and tabs, with &nbsp; entities.

    A run of two or more spaces becomes one ordinary space followed by
    one &nbsp; entity for each remaining space, so the run keeps its width
    when rendered as HTML. Single spaces are left untouched. Each tab is
    replaced by four &nbsp; entities.
    '''
    def _entities(mo):
        # Keep the first space literal; encode the rest as entities.
        return ' ' + ('&nbsp;' * (len(mo.group('space')) - 1))

    txt = re.sub('(?P<space>[ ]{2,})', _entities, txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt
|
||||
|
||||
def remove_indents(txt):
    '''
    Remove whitespace at the beginning of each line.

    Only horizontal whitespace is stripped. Newlines are never consumed,
    so blank lines (and therefore paragraph separators) are preserved.
    '''
    # [^\S\n] is "any whitespace except newline". A plain \s+ anchored with
    # ^ in multiline mode would also swallow the newlines of blank lines
    # and silently merge paragraphs.
    return re.sub(r'(?mu)^[^\S\n]+', '', txt)
|
||||
|
||||
@ -119,6 +148,9 @@ def opf_writer(path, opf_name, manifest, spine, mi):
|
||||
opf.render(opffile)
|
||||
|
||||
def split_string_separator(txt, size):
|
||||
'''
|
||||
Splits the text by putting \n\n at the point size.
|
||||
'''
|
||||
if len(txt) > size:
|
||||
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
|
||||
txt[i:i+size], 1) for i in
|
||||
@ -127,7 +159,7 @@ def split_string_separator(txt, size) :
|
||||
|
||||
def detect_paragraph_type(txt):
|
||||
'''
|
||||
Tries to determine the formatting of the document.
|
||||
Tries to determine the paragraph type of the document.
|
||||
|
||||
block: Paragraphs are separated by a blank line.
|
||||
single: Each line is a paragraph.
|
||||
@ -170,6 +202,16 @@ def detect_paragraph_type(txt):
|
||||
|
||||
|
||||
def detect_formatting_type(txt):
|
||||
'''
|
||||
Tries to determine the formatting of the document.
|
||||
|
||||
markdown: Markdown formatting is used.
|
||||
textile: Textile formatting is used.
|
||||
heuristic: When none of the above formatting types are
|
||||
detected heuristic is returned.
|
||||
'''
|
||||
# Keep a count of the number of format specific object
|
||||
# that are found in the text.
|
||||
markdown_count = 0
|
||||
textile_count = 0
|
||||
|
||||
@ -193,6 +235,8 @@ def detect_formatting_type(txt):
|
||||
# Links
|
||||
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
|
||||
|
||||
# Decide if either markdown or textile is used in the text
|
||||
# based on the number of unique formatting elements found.
|
||||
if markdown_count > 5 or textile_count > 5:
|
||||
if markdown_count > textile_count:
|
||||
return 'markdown'
|
||||
|
@ -89,6 +89,7 @@ class MessageBox(QDialog, Ui_Dialog):
|
||||
(__version__, unicode(self.windowTitle()),
|
||||
unicode(self.msg.text()),
|
||||
unicode(self.det_msg.toPlainText())))
|
||||
if hasattr(self, 'ctc_button'):
|
||||
self.ctc_button.setText(_('Copied'))
|
||||
|
||||
def showEvent(self, ev):
|
||||
|
@ -414,7 +414,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
||||
row = self.data._data[index] if index_is_id else self.data[index]
|
||||
return row[self.FIELD_MAP['path']].replace('/', os.sep)
|
||||
|
||||
|
||||
def abspath(self, index, index_is_id=False, create_dirs=True):
|
||||
'Return the absolute path to the directory containing this book\'s files as a unicode string.'
|
||||
path = os.path.join(self.library_path, self.path(index, index_is_id=index_is_id))
|
||||
@ -422,7 +421,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
||||
os.makedirs(path)
|
||||
return path
|
||||
|
||||
|
||||
def construct_path_name(self, id):
|
||||
'''
|
||||
Construct the directory name for this book based on its metadata.
|
||||
@ -432,6 +430,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
|
||||
authors = _('Unknown')
|
||||
author = ascii_filename(authors.split(',')[0])[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace')
|
||||
title = ascii_filename(self.title(id, index_is_id=True))[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace')
|
||||
while author[-1] in (' ', '.'):
|
||||
author = author[:-1]
|
||||
if not author:
|
||||
author = ascii_filename(_('Unknown')).decode(filesystem_encoding, 'replace')
|
||||
path = author + '/' + title + ' (%d)'%id
|
||||
return path
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user