diff --git a/resources/images/news/exiled.png b/resources/images/news/exiled.png new file mode 100644 index 0000000000..c233aaf132 Binary files /dev/null and b/resources/images/news/exiled.png differ diff --git a/resources/recipes/deia.recipe b/resources/recipes/deia.recipe index 980d59d3d1..5d39be9a10 100644 --- a/resources/recipes/deia.recipe +++ b/resources/recipes/deia.recipe @@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe): cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg' timefmt ='[%a, %d %b, %Y]' encoding ='utf8' - language ='es_ES' + language ='es' remove_javascript =True remove_tags_after =dict(id='Texto') remove_tags_before =dict(id='Texto') diff --git a/resources/recipes/el_publico.recipe b/resources/recipes/el_publico.recipe new file mode 100644 index 0000000000..d0da739b03 --- /dev/null +++ b/resources/recipes/el_publico.recipe @@ -0,0 +1,43 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__author__ = 'Gerardo Diez' +__copyright__ = 'Gerardo Diez' +description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)' +__docformat__ = 'restructuredtext en' + +''' +publico.es +''' +from calibre.web.feeds.recipes import BasicNewsRecipe +class Publico(BasicNewsRecipe): + title =u'Publico.es' + __author__ ='Gerardo Diez' + publisher =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.' 
+ category ='news, politics, finances, world, spain, science, catalunya' + oldest_article =1 + max_articles_per_feed =100 + simultaneous_downloads =10 + cover_url =u'http://imagenes.publico.es/css/img/logo_publico.gif' + timefmt ='[%a, %d %b, %Y]' + encoding ='utf8' + language ='es' + remove_javascript =True + no_stylesheets =True + keep_only_tags =dict(id='main') + remove_tags =[ + dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}), + dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}), + dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}), + dict(name='h5', attrs={'id':'comentarios'}) + ] + feeds =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'), + (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'), + (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'), + (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'), + (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'), + (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'), + (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'), + (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'), + (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')] + + diff --git a/resources/recipes/elpais_impreso.recipe b/resources/recipes/elpais_impreso.recipe index 130013286c..b22a41dcec 100644 --- a/resources/recipes/elpais_impreso.recipe +++ b/resources/recipes/elpais_impreso.recipe @@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe): no_stylesheets = True encoding = 'cp1252' use_embedded_content = False - language = 'es_ES' + language = 'es' remove_empty_feeds = True publication_type = 'newspaper' masthead_url = 'http://www.elpais.com/im/tit_logo.gif' @@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe): ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' ) ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062') ,(u'Galicia' , 
u'http://www.elpais.com/rss/feed.html?feedId=17063') - ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' ) - ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' ) + ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' ) + ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' ) ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' ) ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' ) ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052') ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053') ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051') - ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060') + ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060') ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' ) ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' ) ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068') diff --git a/resources/recipes/exiled.recipe b/resources/recipes/exiled.recipe index 72dfc02e8b..6a65e22edc 100644 --- a/resources/recipes/exiled.recipe +++ b/resources/recipes/exiled.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2011, Darko Miletic ' ''' exiledonline.com ''' @@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe): use_embedded_content = False encoding = 'utf8' remove_javascript = True - language = 'en' - - cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif' - - html2lrf_options = [ - '--comment' , description - , '--base-font-size', '10' - , '--category' , category - , '--publisher' , publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + language = 'en' + publication_type = 'newsblog' + masthead_url = 
'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif} + #topslug{font-size: xx-large; font-weight: bold; color: red} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } keep_only_tags = [dict(name='div', attrs={'id':'main'})] @@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] - mtag = '\n\n\n' - soup.head.insert(0,mtag) + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) return soup def get_article_url(self, article): raw = article.get('link', None) final = raw + 'all/1/' return final - diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 29006ffd9b..ae111355e4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -78,6 +78,8 @@ class DocAnalysis(object): linere = re.compile('(?<=
)(?!\s*
).*?(?=
)', re.DOTALL) elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) + elif format == 'txt': + linere = re.compile('.*?\n', re.DOTALL) self.lines = linere.findall(raw) def line_length(self, percent): @@ -561,8 +563,8 @@ class HTMLPreProcessor(object): html = html.replace(start, '') # convert ellipsis to entities to prevent wrapping - html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) + html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) # convert double dashes to em-dash - html = re.sub('\s--\s', u'\u2014', html) + html = re.sub(r'\s--\s', u'\u2014', html) return substitute_entites(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 1bb232c911..27dacdf5fb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -154,7 +154,7 @@ class PreProcessor(object): default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"], [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering @@ -184,7 +184,22 @@ class PreProcessor(object): self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" 
chapters") return html - + def punctuation_unwrap(self, length, content, format): + # define the pieces of the regex + lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?" + blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" + line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" + txt_line_wrap = u"(\u0020|\u0009)*\n" + + unwrap_regex = lookahead+line_ending+blanklines+line_opening + if format == 'txt': + unwrap_regex = lookahead+txt_line_wrap + + unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) + content = unwrap.sub(' ', content) + return content + def __call__(self, html): self.log("********* Preprocessing HTML *********") @@ -194,7 +209,7 @@ class PreProcessor(object): totalwords = 0 totalwords = self.get_word_count(html) - if totalwords < 20: + if totalwords < 50: self.log("not enough text, not preprocessing") return html @@ -312,8 +327,7 @@ class PreProcessor(object): self.log("Done dehyphenating") # Unwrap lines using punctation and line length #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) - unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - html = unwrap.sub(' ', html) + html = self.punctuation_unwrap(length, html, 'html') #check any remaining hyphens, but only unwrap if there is a match dehyphenator = Dehyphenator() html = dehyphenator(html,'html_cleanup', length) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 47e92a45a9..98756c5fa1 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -90,11 +90,21 @@ class TXTInput(InputFormatPlugin): # We don't check for block 
because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single': + if options.paragraph_type in ('single', 'unformatted'): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) + if options.paragraph_type == 'unformatted': + from calibre.ebooks.conversion.utils import PreProcessor + from calibre.ebooks.conversion.preprocess import DocAnalysis + # get length + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + # unwrap lines based on punctuation + preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) + txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f6d628e7c5..c6cf1078cd 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,6 +9,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.conversion.preprocess import DocAnalysis __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -101,27 +102,36 @@ def detect_paragraph_type(txt): single: Each line is a paragraph. print: Each paragraph starts with a 2+ spaces or a tab and ends when a new paragraph is reached. - markdown: Markdown formatting is in the document.
+ unformatted: most lines have hard line breaks, few/no spaces or indents - returns block, single, print, markdown + returns block, single, print, unformatted ''' txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) - # Check for print - tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: - return 'print' + # Check for hard line breaks - true if 55% of the doc breaks in the same region + docanalysis = DocAnalysis('txt', txt) + hardbreaks = docanalysis.line_histogram(.55) - # Check for block - empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: - return 'block' + if hardbreaks: + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Assume unformatted text with hardbreaks if nothing else matches + return 'unformatted' - # Nothing else matched to assume single. 
+ # return single if hardbreaks is false return 'single' + def detect_formatting_type(txt): # Check for markdown # Headings diff --git a/src/calibre/gui2/wizard/send_email.py b/src/calibre/gui2/wizard/send_email.py index b9b65dc940..5785f52276 100644 --- a/src/calibre/gui2/wizard/send_email.py +++ b/src/calibre/gui2/wizard/send_email.py @@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \ from calibre.gui2.wizard.send_email_ui import Ui_Form from calibre.utils.smtp import config as smtp_prefs from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog -from calibre.gui2 import error_dialog +from calibre.gui2 import error_dialog, question_dialog class TestEmail(QDialog, TE_Dialog): @@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form): pa = self.preferred_to_address() to_set = pa is not None if self.set_email_settings(to_set): - TestEmail(pa, self).exec_() + if question_dialog(self, _('OK to proceed?'), + _('This will display your email password on the screen' + '. 
Is it OK to proceed?'), show_copy_button=False): + TestEmail(pa, self).exec_() def test_email_settings(self, to): opts = smtp_prefs().parse() diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index df1341fc38..349800c8ba 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -21,7 +21,7 @@ from calibre.utils.config import config_dir from calibre.utils.date import format_date, isoformat, now as nowf from calibre.utils.logging import default_log as log from calibre.utils.zipfile import ZipFile, ZipInfo -from calibre.utils.magick.draw import identify_data, thumbnail +from calibre.utils.magick.draw import thumbnail FIELDS = ['all', 'author_sort', 'authors', 'comments', 'cover', 'formats', 'id', 'isbn', 'ondevice', 'pubdate', 'publisher', 'rating', @@ -2863,32 +2863,15 @@ class EPUB_MOBI(CatalogPlugin): i/float(len(self.booksByTitle))) thumb_file = 'thumbnail_%d.jpg' % int(title['id']) - valid_cover = True + thumb_generated = True try: - _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read()) - except: - valid_cover = False - - if valid_cover: - # Add the thumb spec to thumbs[] - thumbs.append("thumbnail_%d.jpg" % int(title['id'])) self.generateThumbnail(title, image_dir, thumb_file) - ''' - # Check to see if thumbnail exists - thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id'])) - thumb_file = 'thumbnail_%d.jpg' % int(title['id']) - if os.path.isfile(thumb_fp): - # Check to see if cover is newer than thumbnail - # os.path.getmtime() = modified time - # os.path.ctime() = creation time - cover_timestamp = os.path.getmtime(title['cover']) - thumb_timestamp = os.path.getmtime(thumb_fp) - if thumb_timestamp < cover_timestamp: - self.generateThumbnail(title, image_dir, thumb_file) - else: - self.generateThumbnail(title, image_dir, thumb_file) - ''' - else: + thumbs.append("thumbnail_%d.jpg" % int(title['id'])) + except: + thumb_generated = False + + + if not thumb_generated: # Use default cover if 
False and self.verbose: self.opts.log.warn(" using default cover for '%s'" % \ diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 3a7ae16598..4b2b169d72 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -533,17 +533,23 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length at which a line should be unwrapped. Valid values are a decimal -between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more -text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input. +between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more +text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`. Also, they often have headers and footers as part of the document that will become included with the text. Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not -removed from the text it can throw off the paragraph unwrapping. +removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read +:ref:`regexptutorial`. -Some limitations of PDF input is complex, multi-column, and image based documents are not supported. -Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to -represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are -represented internally in the PDF. 
+Some limitations of PDF input are: + + * Complex, multi-column, and image-based documents are not supported. + * Extraction of vector images and tables from within the document is also not supported. + * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF. + * Some PDFs store their images upside down with a rotation instruction; |app| currently doesn't support that instruction, so the images will be rotated in the output as well. + +To reiterate, **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an +output ranging anywhere from decent to unusable, depending on the input PDF. Comic Book Collections ~~~~~~~~~~~~~~~~~~~~~~~~~