merge from trunk

This commit is contained in:
ldolse 2011-02-07 20:17:08 +08:00
commit e77e42d1a9
9 changed files with 226 additions and 24 deletions

View File

@ -0,0 +1,55 @@
__license__ = 'GPL v3'
__author__ = 'Luis Hernandez'
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
__version__ = 'v1.0'
__date__ = '30 January 2011'
'''
www.europapress.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
    # Recipe for the Spanish news agency Europa Press (www.europapress.es).

    # --- Publication metadata -------------------------------------------
    title = u'Europa Press'
    # NOTE(review): the Radio Praha recipes in this same commit set
    # `__author__` on the class instead of `author`; confirm which
    # attribute the recipe framework actually reads.
    author = 'Luis Hernandez'
    description = 'spanish news agency'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    # --- Download limits ------------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 100

    # --- Content handling -----------------------------------------------
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False

    # Everything before the first matching element and after the last
    # matching element is discarded.
    remove_tags_before = {'name': 'div', 'attrs': {'class': ['nivel1 bg_3col']}}
    remove_tags_after = {'name': 'div', 'attrs': {'id': ['ImprimirEnviarNoticia']}}

    # Elements stripped from the remaining article body.
    remove_tags = [
        {
            'name': 'ul',
            'attrs': {'id': ['entidadesNoticia', 'MenuSecciones']},
        },
        {
            'name': 'div',
            'attrs': {'id': [
                'ImprimirEnviarNoticia',
                'PublicidadSuperior',
                'CabeceraDerecha',
                'Comentarios',
                'comentarios full fbConnectAPI',
                'ComentarEstaNoticia',
                'ctl00_Superior_Main_MasEnChance_cajamasnoticias',
                'gl_chn',
                'videos_portada_derecha',
                'galeria_portada_central',
                'galeria_portada_central_boxes',
            ]},
        },
        {
            'name': 'div',
            'attrs': {'class': [
                'infoRelacionada',
                'col_1',
                'buscador',
                'caja doblecolumna strong',
                'CHANCE_EP_Encuesta_frontal text',
                'seccionportada col_0',
                'seccion header',
                'text',
                'pie caption_over',
            ]},
        },
        {'name': 'a', 'attrs': {'class': ['buscadorLabel']}},
        {'name': 'span', 'attrs': {'class': ['editado']}},
        {'name': 'table'},
        {'name': 'li'},
    ]

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        (u'Portada', u'http://www.europapress.es/rss/rss.aspx'),
        (u'Nacional', u'http://www.europapress.es/rss/rss.aspx?ch=66'),
        (u'Internacional', u'http://www.europapress.es/rss/rss.aspx?ch=69'),
        (u'Economia', u'http://www.europapress.es/rss/rss.aspx?ch=136'),
        (u'Deportes', u'http://www.europapress.es/rss/rss.aspx?ch=67'),
        (u'Cultura', u'http://www.europapress.es/rss/rss.aspx?ch=126'),
        (u'Sociedad', u'http://www.europapress.es/rss/rss.aspx?ch=73'),
        (u'Motor', u'http://www.europapress.es/rss/rss.aspx?ch=435'),
        (u'CHANCE', u'http://www.europapress.es/rss/rss.aspx?ch=549'),
        (u'Comunicados', u'http://www.europapress.es/rss/rss.aspx?ch=137'),
    ]

View File

@ -35,7 +35,7 @@ class IrishTimes(BasicNewsRecipe):
def print_version(self, url):
if url.count('rss.feedsportal.com'):
u = 'http://www.irishtimes.com' + \
(((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
(((url[70:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
else:
u = url.replace('.html','_pf.html')
return u

View File

@ -0,0 +1,43 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1291540961(BasicNewsRecipe):
    # Recipe for the English-language edition of Radio Praha (www.radio.cz).

    # --- Publication metadata -------------------------------------------
    title = u'Radio Praha'
    __author__ = 'Francois Pellicaan'
    description = 'News and information from and about The Czech republic. '
    publisher = 'Radio Prague'
    category = 'News'
    language = 'en_CZ'
    publication_type = 'newsportal'

    # --- Download limits ------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    # --- Content handling -----------------------------------------------
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    extra_css = 'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; } \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }'

    # Only the main content container is kept ...
    keep_only_tags = [
        dict(name='div', attrs={'class': ['main']}),
    ]
    # ... and these elements are then stripped out of it.
    remove_tags = [
        dict(name='div', attrs={'class': ['cleaner', 'options', 'toolsXXL']}),
        dict(name='ul', attrs={'class': ['tools']}),
    ]

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        (u'Current Affairs', 'http://www.radio.cz/feeds/rss/en/themes/curraffrs.xml'),
        (u'Society', 'http://www.radio.cz/feeds/rss/en/themes/society.xml'),
        # The scheme was previously doubled ('http:http://...'), which is
        # not a valid URL; it now matches the other feed URLs.
        (u'European Union', 'http://www.radio.cz/feeds/rss/en/themes/eu.xml'),
        (u'Foreign policy', 'http://www.radio.cz/feeds/rss/en/themes/foreignpolicy.xml'),
        (u'Business', 'http://www.radio.cz/feeds/rss/en/themes/business.xml'),
        (u'Culture', 'http://www.radio.cz/feeds/rss/en/themes/culture.xml'),
        (u'Czechs abroad', 'http://www.radio.cz/feeds/rss/en/themes/czechabroad.xml'),
        (u'History', 'http://www.radio.cz/feeds/rss/en/themes/history.xml'),
        (u'Nature', 'http://www.radio.cz/feeds/rss/en/themes/nature.xml'),
        (u'Science', 'http://www.radio.cz/feeds/rss/en/themes/science.xml'),
        (u'Sport', 'http://www.radio.cz/feeds/rss/en/themes/sport.xml'),
        (u'Travel', 'http://www.radio.cz/feeds/rss/en/themes/travel.xml'),
    ]

View File

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1291540961(BasicNewsRecipe):
    # Recipe for the Czech-language edition of Radio Praha (www.radio.cz).

    # --- Publication metadata -------------------------------------------
    title = u'Radio Praha'
    __author__ = 'Francois Pellicaan'
    description = u'Česká oficiální mezinárodní vysílací stanice.'
    publisher = u'Český rozhlas'
    category = 'News'
    language = 'cs'
    publication_type = 'newsportal'

    # --- Download limits ------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    # --- Content handling -----------------------------------------------
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    extra_css = u'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; } \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }'

    # Only the main content container is kept ...
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': ['main']}},
    ]
    # ... and these elements are then stripped out of it.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['cleaner', 'options', 'toolsXXL']}},
        {'name': 'ul', 'attrs': {'class': ['tools']}},
    ]

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        (u'Domácí politika', 'http://www.radio.cz/feeds/rss/cs/oblast/dompol.xml'),
        (u'Společnost', 'http://www.radio.cz/feeds/rss/cs/oblast/spolecnost.xml'),
        (u'Evropská unie', 'http://www.radio.cz/feeds/rss/cs/oblast/eu.xml'),
        (u'Zahraniční politika', 'http://www.radio.cz/feeds/rss/cs/oblast/zahrpol.xml'),
        (u'Ekonomika', 'http://www.radio.cz/feeds/rss/cs/oblast/ekonomika.xml'),
        (u'Kultura', 'http://www.radio.cz/feeds/rss/cs/oblast/kultura.xml'),
        (u'Krajané', 'http://www.radio.cz/feeds/rss/cs/oblast/krajane.xml'),
        (u'Historie', 'http://www.radio.cz/feeds/rss/cs/oblast/historie.xml'),
        (u'Příroda', 'http://www.radio.cz/feeds/rss/cs/oblast/priroda.xml'),
        (u'Věda', 'http://www.radio.cz/feeds/rss/cs/oblast/veda.xml'),
        (u'Sport', 'http://www.radio.cz/feeds/rss/cs/oblast/sport.xml'),
        (u'Cestování', 'http://www.radio.cz/feeds/rss/cs/oblast/cestovani.xml'),
    ]

View File

@ -15,6 +15,7 @@ from calibre import guess_type, strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
from calibre.library.comments import comments_to_html
from calibre.utils.date import is_date_undefined
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
@ -130,7 +131,10 @@ def render_jacket(mi, output_profile,
publisher = ''
try:
pubdate = strftime(u'%Y', mi.pubdate.timetuple())
if is_date_undefined(mi.pubdate):
pubdate = ''
else:
pubdate = strftime(u'%Y', mi.pubdate.timetuple())
except:
pubdate = ''
@ -175,19 +179,24 @@ def render_jacket(mi, output_profile,
soup = BeautifulSoup(generated_html)
if not series:
series_tag = soup.find(attrs={'class':'cbj_series'})
series_tag.extract()
if series_tag is not None:
series_tag.extract()
if not rating:
rating_tag = soup.find(attrs={'class':'cbj_rating'})
rating_tag.extract()
if rating_tag is not None:
rating_tag.extract()
if not tags:
tags_tag = soup.find(attrs={'class':'cbj_tags'})
tags_tag.extract()
if tags_tag is not None:
tags_tag.extract()
if not pubdate:
pubdate_tag = soup.find(attrs={'class':'cbj_pubdate'})
pubdate_tag.extract()
pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
if pubdate_tag is not None:
pubdate_tag.extract()
if output_profile.short_name != 'kindle':
hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
hr_tag.extract()
if hr_tag is not None:
hr_tag.extract()
return soup.renderContents(None)

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin):
setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False)
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces to a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
@ -114,6 +106,7 @@ class TXTInput(InputFormatPlugin):
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import HeuristicProcessor
# unwrap lines based on punctuation
@ -122,6 +115,8 @@ class TXTInput(InputFormatPlugin):
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
else:
txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
docanalysis = DocAnalysis('txt', txt)
@ -130,6 +125,15 @@ class TXTInput(InputFormatPlugin):
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
# User requested transformation on the text.
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces to a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Process the text using the appropriate text processor.
html = ''
if options.formatting_type == 'markdown':

View File

@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def clean_txt(txt):
'''
Run transformations on the text to put it into
consistent state.
'''
if isbytestring(txt):
txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the end of the line. Also replace
@ -42,6 +46,15 @@ def clean_txt(txt):
return txt
def split_txt(txt, epub_split_size_kb=0):
'''
Ensure there are split points for converting
to EPUB. A misdetected paragraph type can
result in the entire document being one giant
paragraph. In this case the EPUB parser will not
be able to determine where to split the file
to accommodate the EPUB file size limitation
and will fail.
'''
#Takes care if there is no point to split
if epub_split_size_kb > 0:
if isinstance(txt, unicode):
@ -59,6 +72,12 @@ def split_txt(txt, epub_split_size_kb=0):
return txt
def convert_basic(txt, title='', epub_split_size_kb=0):
'''
Converts plain text to html by putting all paragraphs in
<p> tags. It condenses and retains blank lines when necessary.
Requires paragraphs to be in single line format.
'''
txt = clean_txt(txt)
txt = split_txt(txt, epub_split_size_kb)
@ -99,15 +118,25 @@ def separate_paragraphs_single_line(txt):
return txt
def separate_paragraphs_print_formatted(txt):
txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt
def block_to_single_line(txt):
    '''
    Join the lines inside each block paragraph into one line.

    A newline that has a non-newline character on both sides is replaced
    with a single space. Newlines that touch another newline (blank-line
    separators) or the start/end of the text are left untouched.
    '''
    intra_paragraph_break = re.compile(r'(?<=.)\n(?=.)')
    return intra_paragraph_break.sub(' ', txt)
def preserve_spaces(txt):
    '''
    Replace runs of spaces and tabs with &nbsp; entities.

    A run of N (N >= 2) spaces becomes one ordinary space followed by
    N - 1 &nbsp; entities. Every tab becomes four &nbsp; entities.
    Single spaces are left as they are.
    '''
    def _pad(match):
        # Keep one real space and pad the remainder of the run.
        run_length = len(match.group('space'))
        return ' ' + '&nbsp;' * (run_length - 1)

    spaced = re.sub('(?P<space>[ ]{2,})', _pad, txt)
    return spaced.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
def remove_indents(txt):
    '''
    Remove whitespace at the beginning of each line.

    Only non-newline whitespace is stripped. Matching every whitespace
    character here would also consume the newline of an empty line, which
    deletes blank lines (and with them blank-line paragraph separators)
    instead of just removing indentation.

    Returns the text with each line's leading whitespace removed; the
    number of lines is unchanged.
    '''
    # [^\S\n] == "any whitespace character except a newline".
    return re.sub(r'(?miu)^[^\S\n]+', '', txt)
@ -118,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
with open(os.path.join(path, opf_name), 'wb') as opffile:
opf.render(opffile)
def split_string_separator(txt, size) :
def split_string_separator(txt, size):
'''
Splits the text by putting \n\n at the point size.
'''
if len(txt) > size:
txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
txt[i:i+size], 1) for i in
@ -127,7 +159,7 @@ def split_string_separator(txt, size) :
def detect_paragraph_type(txt):
'''
Tries to determine the formatting of the document.
Tries to determine the paragraph type of the document.
block: Paragraphs are separated by a blank line.
single: Each line is a paragraph.
@ -170,6 +202,16 @@ def detect_paragraph_type(txt):
def detect_formatting_type(txt):
'''
Tries to determine the formatting of the document.
markdown: Markdown formatting is used.
textile: Textile formatting is used.
heuristic: When none of the above formatting types are
detected heuristic is returned.
'''
# Keep a count of the number of format specific objects
# that are found in the text.
markdown_count = 0
textile_count = 0
@ -193,6 +235,8 @@ def detect_formatting_type(txt):
# Links
textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
# Decide if either markdown or textile is used in the text
# based on the number of unique formatting elements found.
if markdown_count > 5 or textile_count > 5:
if markdown_count > textile_count:
return 'markdown'

View File

@ -89,7 +89,8 @@ class MessageBox(QDialog, Ui_Dialog):
(__version__, unicode(self.windowTitle()),
unicode(self.msg.text()),
unicode(self.det_msg.toPlainText())))
self.ctc_button.setText(_('Copied'))
if hasattr(self, 'ctc_button'):
self.ctc_button.setText(_('Copied'))
def showEvent(self, ev):
ret = QDialog.showEvent(self, ev)

View File

@ -414,7 +414,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
row = self.data._data[index] if index_is_id else self.data[index]
return row[self.FIELD_MAP['path']].replace('/', os.sep)
def abspath(self, index, index_is_id=False, create_dirs=True):
'Return the absolute path to the directory containing this books files as a unicode string.'
path = os.path.join(self.library_path, self.path(index, index_is_id=index_is_id))
@ -422,7 +421,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
os.makedirs(path)
return path
def construct_path_name(self, id):
'''
Construct the directory name for this book based on its metadata.
@ -432,7 +430,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
authors = _('Unknown')
author = ascii_filename(authors.split(',')[0])[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace')
title = ascii_filename(self.title(id, index_is_id=True))[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace')
path = author + '/' + title + ' (%d)'%id
while author[-1] in (' ', '.'):
author = author[:-1]
if not author:
author = ascii_filename(_('Unknown')).decode(filesystem_encoding, 'replace')
path = author + '/' + title + ' (%d)'%id
return path
def construct_file_name(self, id):