diff --git a/resources/recipes/europa_press.recipe b/resources/recipes/europa_press.recipe new file mode 100644 index 0000000000..ace0f8b6d1 --- /dev/null +++ b/resources/recipes/europa_press.recipe @@ -0,0 +1,55 @@ +__license__ = 'GPL v3' +__author__ = 'Luis Hernandez' +__copyright__ = 'Luis Hernandez' +__version__ = 'v1.0' +__date__ = '30 January 2011' + +''' +www.europapress.es +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1294946868(BasicNewsRecipe): + + title = u'Europa Press' + author = 'Luis Hernandez' + description = 'spanish news agency' + + oldest_article = 2 + max_articles_per_feed = 100 + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + + language = 'es' + timefmt = '[%a, %d %b, %Y]' + + remove_tags_before = dict(name='div' , attrs={'class':['nivel1 bg_3col']}) + remove_tags_after = dict(name='div' , attrs={'id':['ImprimirEnviarNoticia']}) + + remove_tags = [ + dict(name='ul', attrs={'id':['entidadesNoticia','MenuSecciones']}) + ,dict(name='div', attrs={'id':['ImprimirEnviarNoticia','PublicidadSuperior','CabeceraDerecha','Comentarios','comentarios full fbConnectAPI','ComentarEstaNoticia','ctl00_Superior_Main_MasEnChance_cajamasnoticias','gl_chn','videos_portada_derecha','galeria_portada_central','galeria_portada_central_boxes']}) + ,dict(name='div', attrs={'class':['infoRelacionada','col_1','buscador','caja doblecolumna strong','CHANCE_EP_Encuesta_frontal text','seccionportada col_0','seccion header','text','pie caption_over']}) + ,dict(name='a', attrs={'class':['buscadorLabel']}) + ,dict(name='span', attrs={'class':['editado']}) + ,dict(name='table') + ,dict(name='li') + ] + + + feeds = [ + (u'Portada' , u'http://www.europapress.es/rss/rss.aspx') + ,(u'Nacional' , u'http://www.europapress.es/rss/rss.aspx?ch=66') + ,(u'Internacional' , u'http://www.europapress.es/rss/rss.aspx?ch=69') + ,(u'Economia' , u'http://www.europapress.es/rss/rss.aspx?ch=136') + ,(u'Deportes' , 
u'http://www.europapress.es/rss/rss.aspx?ch=67') + ,(u'Cultura' , u'http://www.europapress.es/rss/rss.aspx?ch=126') + ,(u'Sociedad' , u'http://www.europapress.es/rss/rss.aspx?ch=73') + ,(u'Motor' , u'http://www.europapress.es/rss/rss.aspx?ch=435') + ,(u'CHANCE' , u'http://www.europapress.es/rss/rss.aspx?ch=549') + ,(u'Comunicados' , u'http://www.europapress.es/rss/rss.aspx?ch=137') + ] + diff --git a/resources/recipes/irish_times.recipe b/resources/recipes/irish_times.recipe index 0ac130ed7a..83ea496b2c 100644 --- a/resources/recipes/irish_times.recipe +++ b/resources/recipes/irish_times.recipe @@ -35,7 +35,7 @@ class IrishTimes(BasicNewsRecipe): def print_version(self, url): if url.count('rss.feedsportal.com'): u = 'http://www.irishtimes.com' + \ - (((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html') + (((url[70:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html') else: u = url.replace('.html','_pf.html') return u diff --git a/resources/recipes/radio_prague.recipe b/resources/recipes/radio_prague.recipe new file mode 100644 index 0000000000..2e228e06a9 --- /dev/null +++ b/resources/recipes/radio_prague.recipe @@ -0,0 +1,43 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1291540961(BasicNewsRecipe): + + title = u'Radio Praha' + __author__ = 'Francois Pellicaan' + description = 'News and information from and about The Czech republic. 
' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + encoding = 'utf8' + publisher = 'Radio Prague' + category = 'News' + language = 'en_CZ' + publication_type = 'newsportal' + + extra_css = 'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; } \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }' + + + keep_only_tags = [ + dict(name='div', attrs={'class':['main']}) + ] + remove_tags = [ + dict(name='div', attrs={'class':['cleaner', 'options', 'toolsXXL']}), + dict(name='ul', attrs={'class':['tools']}) + ] + feeds = [ + (u'Current Affairs', 'http://www.radio.cz/feeds/rss/en/themes/curraffrs.xml'), + (u'Society', 'http://www.radio.cz/feeds/rss/en/themes/society.xml'), + (u'European Union', 'http://www.radio.cz/feeds/rss/en/themes/eu.xml'), + (u'Foreign policy', 'http://www.radio.cz/feeds/rss/en/themes/foreignpolicy.xml'), + (u'Business', 'http://www.radio.cz/feeds/rss/en/themes/business.xml'), + (u'Culture', 'http://www.radio.cz/feeds/rss/en/themes/culture.xml'), + (u'Czechs abroad', 'http://www.radio.cz/feeds/rss/en/themes/czechabroad.xml'), + (u'History', 'http://www.radio.cz/feeds/rss/en/themes/history.xml'), + (u'Nature', 'http://www.radio.cz/feeds/rss/en/themes/nature.xml'), + (u'Science', 'http://www.radio.cz/feeds/rss/en/themes/science.xml'), + (u'Sport', 'http://www.radio.cz/feeds/rss/en/themes/sport.xml'), + (u'Travel', 'http://www.radio.cz/feeds/rss/en/themes/travel.xml'), + ] diff --git a/resources/recipes/radio_praha.recipe b/resources/recipes/radio_praha.recipe new file mode 100644 index 0000000000..9f14a55e40 --- /dev/null +++ b/resources/recipes/radio_praha.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class 
AdvancedUserRecipe1291540961(BasicNewsRecipe): + + title = u'Radio Praha' + __author__ = 'Francois Pellicaan' + description = u'Česká oficiální mezinárodní vysílací stanice.' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + encoding = 'utf8' + publisher = u'Český rozhlas' + category = 'News' + language = 'cs' + publication_type = 'newsportal' + + extra_css = u'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; } \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }' + + + keep_only_tags = [ + dict(name='div', attrs={'class':['main']}) + ] + remove_tags = [ + dict(name='div', attrs={'class':['cleaner', 'options', 'toolsXXL']}), + dict(name='ul', attrs={'class':['tools']}) + ] + feeds = [ + (u'Domácí politika', 'http://www.radio.cz/feeds/rss/cs/oblast/dompol.xml'), + (u'Společnost', 'http://www.radio.cz/feeds/rss/cs/oblast/spolecnost.xml'), + (u'Evropská unie', 'http://www.radio.cz/feeds/rss/cs/oblast/eu.xml'), + (u'Zahraniční politika', 'http://www.radio.cz/feeds/rss/cs/oblast/zahrpol.xml'), + (u'Ekonomika', 'http://www.radio.cz/feeds/rss/cs/oblast/ekonomika.xml'), + (u'Kultura', 'http://www.radio.cz/feeds/rss/cs/oblast/kultura.xml'), + (u'Krajané', 'http://www.radio.cz/feeds/rss/cs/oblast/krajane.xml'), + (u'Historie', 'http://www.radio.cz/feeds/rss/cs/oblast/historie.xml'), + (u'Příroda', 'http://www.radio.cz/feeds/rss/cs/oblast/priroda.xml'), + (u'Věda', 'http://www.radio.cz/feeds/rss/cs/oblast/veda.xml'), + (u'Sport', 'http://www.radio.cz/feeds/rss/cs/oblast/sport.xml'), + (u'Cestování', 'http://www.radio.cz/feeds/rss/cs/oblast/cestovani.xml'), + ] diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index fe0d60de7a..d3b66d1e81 100644 --- 
a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -15,6 +15,7 @@ from calibre import guess_type, strftime from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML from calibre.library.comments import comments_to_html +from calibre.utils.date import is_date_undefined JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]' @@ -130,7 +131,10 @@ def render_jacket(mi, output_profile, publisher = '' try: - pubdate = strftime(u'%Y', mi.pubdate.timetuple()) + if is_date_undefined(mi.pubdate): + pubdate = '' + else: + pubdate = strftime(u'%Y', mi.pubdate.timetuple()) except: pubdate = '' @@ -175,19 +179,24 @@ def render_jacket(mi, output_profile, soup = BeautifulSoup(generated_html) if not series: series_tag = soup.find(attrs={'class':'cbj_series'}) - series_tag.extract() + if series_tag is not None: + series_tag.extract() if not rating: rating_tag = soup.find(attrs={'class':'cbj_rating'}) - rating_tag.extract() + if rating_tag is not None: + rating_tag.extract() if not tags: tags_tag = soup.find(attrs={'class':'cbj_tags'}) - tags_tag.extract() + if tags_tag is not None: + tags_tag.extract() if not pubdate: - pubdate_tag = soup.find(attrs={'class':'cbj_pubdate'}) - pubdate_tag.extract() + pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'}) + if pubdate_tag is not None: + pubdate_tag.extract() if output_profile.short_name != 'kindle': hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'}) - hr_tag.extract() + if hr_tag is not None: + hr_tag.extract() return soup.renderContents(None) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index dc624519bb..e240205222 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, 
separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile, remove_indents + normalize_line_endings, convert_textile, remove_indents, block_to_single_line from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin): setattr(options, 'enable_heuristics', True) setattr(options, 'unwrap_lines', False) - if options.txt_in_remove_indents: - txt = remove_indents(txt) - - # Preserve spaces will replace multiple spaces to a space - # followed by the   entity. - if options.preserve_spaces: - txt = preserve_spaces(txt) - # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. @@ -114,6 +106,7 @@ class TXTInput(InputFormatPlugin): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) + txt = block_to_single_line(txt) elif options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation @@ -122,6 +115,8 @@ class TXTInput(InputFormatPlugin): preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = separate_paragraphs_single_line(txt) + else: + txt = block_to_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt) @@ -130,6 +125,15 @@ class TXTInput(InputFormatPlugin): dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) + # User requested transformation on the text. 
+ if options.txt_in_remove_indents: + txt = remove_indents(txt) + + # Preserve spaces will replace multiple spaces to a space + # followed by the   entity. + if options.preserve_spaces: + txt = preserve_spaces(txt) + # Process the text using the appropriate text processor. html = '' if options.formatting_type == 'markdown': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 987d7cdc73..f7b6cce234 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars HTML_TEMPLATE = u'%s\n%s\n' def clean_txt(txt): + ''' + Run transformations on the text to put it into + consistent state. + ''' if isbytestring(txt): txt = txt.decode('utf-8', 'replace') # Strip whitespace from the end of the line. Also replace @@ -42,6 +46,15 @@ def clean_txt(txt): return txt def split_txt(txt, epub_split_size_kb=0): + ''' + Ensure there are split points for converting + to EPUB. A misdetected paragraph type can + result in the entire document being one giant + paragraph. In this case the EPUB parser will not + be able to determine where to split the file + to accomidate the EPUB file size limitation + and will fail. + ''' #Takes care if there is no point to split if epub_split_size_kb > 0: if isinstance(txt, unicode): @@ -59,6 +72,12 @@ def split_txt(txt, epub_split_size_kb=0): return txt def convert_basic(txt, title='', epub_split_size_kb=0): + ''' + Converts plain text to html by putting all paragraphs in +

<p> tags. It condenses and retains blank lines when necessary. + + Requires paragraphs to be in single line format. + ''' txt = clean_txt(txt) txt = split_txt(txt, epub_split_size_kb) @@ -99,15 +118,25 @@ def separate_paragraphs_single_line(txt): return txt def separate_paragraphs_print_formatted(txt): - txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt) + txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) + return txt + +def block_to_single_line(txt): + txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt) return txt def preserve_spaces(txt): + ''' + Replaces multiple spaces with &nbsp; entities. + ''' txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt) txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') return txt def remove_indents(txt): + ''' + Remove whitespace at the beginning of each line. + ''' txt = re.sub('(?miu)^\s+', '', txt) return txt @@ -118,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi): with open(os.path.join(path, opf_name), 'wb') as opffile: opf.render(opffile) -def split_string_separator(txt, size) : +def split_string_separator(txt, size): + ''' + Splits the text by putting \n\n at the point size. + ''' if len(txt) > size: txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>', txt[i:i+size], 1) for i in @@ -127,7 +159,7 @@ def split_string_separator(txt, size) : def detect_paragraph_type(txt): ''' - Tries to determine the formatting of the document. + Tries to determine the paragraph type of the document. block: Paragraphs are separated by a blank line. single: Each line is a paragraph. @@ -170,6 +202,16 @@ def detect_paragraph_type(txt): def detect_formatting_type(txt): + ''' + Tries to determine the formatting of the document. + + markdown: Markdown formatting is used. + textile: Textile formatting is used. + heuristic: When none of the above formatting types are + detected heuristic is returned. 
+ ''' + # Keep a count of the number of format specific object + # that are found in the text. markdown_count = 0 textile_count = 0 @@ -193,6 +235,8 @@ def detect_formatting_type(txt): # Links textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt)) + # Decide if either markdown or textile is used in the text + # based on the number of unique formatting elements found. if markdown_count > 5 or textile_count > 5: if markdown_count > textile_count: return 'markdown' diff --git a/src/calibre/gui2/dialogs/message_box.py b/src/calibre/gui2/dialogs/message_box.py index 9d586ce28d..945d50de4e 100644 --- a/src/calibre/gui2/dialogs/message_box.py +++ b/src/calibre/gui2/dialogs/message_box.py @@ -89,7 +89,8 @@ class MessageBox(QDialog, Ui_Dialog): (__version__, unicode(self.windowTitle()), unicode(self.msg.text()), unicode(self.det_msg.toPlainText()))) - self.ctc_button.setText(_('Copied')) + if hasattr(self, 'ctc_button'): + self.ctc_button.setText(_('Copied')) def showEvent(self, ev): ret = QDialog.showEvent(self, ev) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 792081732c..5702b75317 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -414,7 +414,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): row = self.data._data[index] if index_is_id else self.data[index] return row[self.FIELD_MAP['path']].replace('/', os.sep) - def abspath(self, index, index_is_id=False, create_dirs=True): 'Return the absolute path to the directory containing this books files as a unicode string.' path = os.path.join(self.library_path, self.path(index, index_is_id=index_is_id)) @@ -422,7 +421,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): os.makedirs(path) return path - def construct_path_name(self, id): ''' Construct the directory name for this book based on its metadata. 
@@ -432,7 +430,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): authors = _('Unknown') author = ascii_filename(authors.split(',')[0])[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace') title = ascii_filename(self.title(id, index_is_id=True))[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace') - path = author + '/' + title + ' (%d)'%id + while author and author[-1] in (' ', '.'): + author = author[:-1] + if not author: + author = ascii_filename(_('Unknown')).decode(filesystem_encoding, 'replace') + path = author + '/' + title + ' (%d)'%id return path def construct_file_name(self, id):