From 09ff8524214cc51091f8ec8dca616e2675e40789 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Jan 2011 06:53:24 -0700 Subject: [PATCH 01/14] El Publico by Gerardo Diez. Fixes #405 (New news feed) --- resources/recipes/deia.recipe | 2 +- resources/recipes/el_publico.recipe | 43 +++++++++++++++++++++++++ resources/recipes/elpais_impreso.recipe | 8 ++--- 3 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 resources/recipes/el_publico.recipe diff --git a/resources/recipes/deia.recipe b/resources/recipes/deia.recipe index 980d59d3d1..5d39be9a10 100644 --- a/resources/recipes/deia.recipe +++ b/resources/recipes/deia.recipe @@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe): cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg' timefmt ='[%a, %d %b, %Y]' encoding ='utf8' - language ='es_ES' + language ='es' remove_javascript =True remove_tags_after =dict(id='Texto') remove_tags_before =dict(id='Texto') diff --git a/resources/recipes/el_publico.recipe b/resources/recipes/el_publico.recipe new file mode 100644 index 0000000000..d0da739b03 --- /dev/null +++ b/resources/recipes/el_publico.recipe @@ -0,0 +1,43 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__author__ = 'Gerardo Diez' +__copyright__ = 'Gerardo Diez' +description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)' +__docformat__ = 'restructuredtext en' + +''' +publico.es +''' +from calibre.web.feeds.recipes import BasicNewsRecipe +class Publico(BasicNewsRecipe): + title =u'Publico.es' + __author__ ='Gerardo Diez' + publisher =u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.' + category ='news, politics, finances, world, spain, science, catalunya' + oldest_article =1 + max_articles_per_feed =100 + simultaneous_downloads =10 + cover_url =u'http://imagenes.publico.es/css/img/logo_publico.gif' + timefmt ='[%a, %d %b, %Y]' + encoding ='utf8' + language ='es' + remove_javascript =True + no_stylesheets =True + keep_only_tags =dict(id='main') + remove_tags =[ + dict(name='div', attrs={'class':['Noticias_642x50', 'contInfo ancho']}), + dict(name='ul', attrs={'class':['navComentarios', 'comentarios']}), + dict(name='div', attrs={'id':['commentsContext', 'toolbar', 'comentarios']}), + dict(name='h5', attrs={'id':'comentarios'}) + ] + feeds =[(u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'), + (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'), + (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'), + (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'), + (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'), + (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'), + (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'), + (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'), + (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes')] + + diff --git a/resources/recipes/elpais_impreso.recipe b/resources/recipes/elpais_impreso.recipe index 130013286c..b22a41dcec 100644 --- a/resources/recipes/elpais_impreso.recipe +++ b/resources/recipes/elpais_impreso.recipe @@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe): no_stylesheets = True encoding = 'cp1252' use_embedded_content = False - language = 'es_ES' + language = 'es' remove_empty_feeds = True publication_type = 'newspaper' masthead_url = 'http://www.elpais.com/im/tit_logo.gif' @@ -57,14 +57,14 @@ class ElPais_RSS(BasicNewsRecipe): ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' ) ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062') ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063') - ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' ) - ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' ) + ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' ) + ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' ) ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' ) ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' ) ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052') ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053') ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051') - ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060') + ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060') ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' ) ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' ) ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068') From 823cdcc4373bc523a0ba584e0eb82febb7d1f231 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Jan 2011 07:27:08 -0700 Subject: [PATCH 02/14] ... --- src/calibre/manual/conversion.rst | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 3a7ae16598..a5aad9b450 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -533,17 +533,22 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length at which a line should be unwrapped. Valid values are a decimal -between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more -text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input. +between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more +text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`. Also, they often have headers and footers as part of the document that will become included with the text. Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not removed from the text it can throw off the paragraph unwrapping. -Some limitations of PDF input is complex, multi-column, and image based documents are not supported. -Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to -represent double ll or doubfle ff or fi,etc. Conversion of these may or may not work depending on jusy how they are -represented internally in the PDF. +Some limitations of PDF input are: + + * Complex, multi-column, and image based documents are not supported. + * Extraction of vector images and tables from within the document is also not supported. + * Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF. + * Some PDFs store their images upside down with a rotation instruction, |app| currently doesn't support that instruction, so the images will be rotated in the output as well. + +To re-iterate **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an +output ranging anywhere from decent to unusable, depending on the input PDF. Comic Book Collections ~~~~~~~~~~~~~~~~~~~~~~~~~ From 8ac2dd0a65776aafcb8132aca5f256c9fcb4acd4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Jan 2011 07:46:55 -0700 Subject: [PATCH 03/14] Email settings: Before displaying the email test dialog warn the user that it will expose their email password --- src/calibre/gui2/wizard/send_email.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/wizard/send_email.py b/src/calibre/gui2/wizard/send_email.py index b9b65dc940..5785f52276 100644 --- a/src/calibre/gui2/wizard/send_email.py +++ b/src/calibre/gui2/wizard/send_email.py @@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \ from calibre.gui2.wizard.send_email_ui import Ui_Form from calibre.utils.smtp import config as smtp_prefs from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog -from calibre.gui2 import error_dialog +from calibre.gui2 import error_dialog, question_dialog class TestEmail(QDialog, TE_Dialog): @@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form): pa = self.preferred_to_address() to_set = pa is not None if self.set_email_settings(to_set): - TestEmail(pa, self).exec_() + if question_dialog(self, _('OK to proceed?'), + _('This will display your email password on the screen' + '. Is it OK to proceed?'), show_copy_button=False): + TestEmail(pa, self).exec_() def test_email_settings(self, to): opts = smtp_prefs().parse() From 4abfeed6accf655c8f61f05bc7027de6b8ecad27 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Jan 2011 08:29:40 -0700 Subject: [PATCH 04/14] ... --- src/calibre/manual/conversion.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index a5aad9b450..4b2b169d72 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -538,7 +538,8 @@ text in the unwrapping. Increase to include less. You can adjust this value in t Also, they often have headers and footers as part of the document that will become included with the text. Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not -removed from the text it can throw off the paragraph unwrapping. +removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read +:ref:`regexptutorial`. Some limitations of PDF input are: From 8f7d8c1022533ef5fd07f6162b03672cadafcb92 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Jan 2011 10:17:36 -0700 Subject: [PATCH 05/14] Fix #8241 (Updated recipe for Exiled online) --- resources/images/news/exiled.png | Bin 0 -> 1352 bytes resources/recipes/exiled.recipe | 37 ++++++++++++++++--------------- 2 files changed, 19 insertions(+), 18 deletions(-) create mode 100644 resources/images/news/exiled.png diff --git a/resources/images/news/exiled.png b/resources/images/news/exiled.png new file mode 100644 index 0000000000000000000000000000000000000000..c233aaf132d07704afa1841db6ddb886d0a76593 GIT binary patch literal 1352 zcmeAS@N?(olHy`uVBq!ia0vp^0w65F1|y3`~05O_bCBhr!<(hh#fNN8|y#?;i*Ri#D1>tiAti^fsE znLO;1CULxY`}VbRR`%=HuivV!x<22w`rn-M!iFDA61jr27g^-x=H{xNO3Aw~C#krp zgF&;?=kwp+{PG<|JAeM%xic+kU8hI(J6{1FzIm3;Dyjbd`TFeUJW7XGTwN8q$S1An z(^=_7)5MNzPLAYcySQ}tZA zWfzx)-@<}75(&3$ZCjZHj%#b%*A+OeooyC7+iF4SB!(cblT&RfKYh^*Sf6fG|Cg1e zF_Vi?rBk4damTxTwXv}dBDH@k%-Jo@?JQ0|&(ycMT>t<7FPU|69zts$&(OX<=ZAU2 z7SVd8kX08soEOCGtNVLORH4h~vcV2zCx*Zy;X&)m%ii7LwD7C0w&TC@`tjrSafOPW zmwf)DKi$vxKtAl%43%xMdd%BmHDqkLJWkH(P`a$bB*pvg%&%WpS6^Sp=ykMEhS~Gb zoq$!G4lDn&D!gq8@?`q)@Amfh_ZY8cMFYWSttBb1D|g0ZYd`qCKHlH2h_Ud`eFmO; zNr|4i+`RpsS_&>fUY<*)q#Ek-HlP2T=iU9DopIv(`@2kyoR);Fe6s7!{8wJ9IHRXM z-jTQTkjKu{m`ba--eZp*P1kRDEMu{u-u{6FGxKtP=Im^7Jsw^|OWtN{C6;*3$t4vP z6%rGScxwO0TWr|&ta7UqbHJ*Z7M@;9=i5h$`^^PLUNbwtEYqWk%$K*#6+FX$fZ?`X z(URS>za|+^D5BImr+W_=WE&hJM@y{FGAe*aro`0-=k zgMnUNOB;H7J3Bi$4}9KqGiMs(hCG7_@%zi(ym-NIsGzE)an3BS zr$0VAF?K9!ZT(ppcFBMxqqMZ7q(o%H+Z?m`>{(lV)Fv~UndzL~#Be{|b8Se`&K$9b zD65(syQMjrT3cILAG{71Q@DQi`R&^?g_m)%G$zhS3!ZejV%yx)Pd^`c^XBjG@^=hM z9Cr13VvK92Pv2jkZ@2zsHQR^Z)Akj48BX7x{~id`KmASG7@@LB=ltsH+ly@HPAFOG zzfr&?@XFP#>HHV(uW?q{)1Y8p3M`3KOI#yLQW8s2t&)pUffR$0fuW(U0T5Y)7#dg^ znp&Bf>l&C_85nFmXte}ILvDUbW?ChR1`{g-3oAnlhz6(dLvMi^7(8A5T-G@yGywq1 Cm_*M2 literal 0 HcmV?d00001 diff --git a/resources/recipes/exiled.recipe b/resources/recipes/exiled.recipe index 72dfc02e8b..6a65e22edc 100644 --- a/resources/recipes/exiled.recipe +++ b/resources/recipes/exiled.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2011, Darko Miletic ' ''' exiledonline.com ''' @@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe): use_embedded_content = False encoding = 'utf8' remove_javascript = True - language = 'en' - - cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif' - - html2lrf_options = [ - '--comment' , description - , '--base-font-size', '10' - , '--category' , category - , '--publisher' , publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + language = 'en' + publication_type = 'newsblog' + masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif} + #topslug{font-size: xx-large; font-weight: bold; color: red} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } keep_only_tags = [dict(name='div', attrs={'id':'main'})] @@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] - mtag = '\n\n\n' - soup.head.insert(0,mtag) + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) return soup def get_article_url(self, article): raw = article.get('link', None) final = raw + 'all/1/' return final - From 611c0373573a6ad74cc0ba5b4d4b8a5788760651 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Jan 2011 10:52:29 -0700 Subject: [PATCH 06/14] ... --- src/calibre/ebooks/conversion/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 97aaa653a9..ae111355e4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -563,8 +563,8 @@ class HTMLPreProcessor(object): html = html.replace(start, '') # convert ellipsis to entities to prevent wrapping - html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) + html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) # convert double dashes to em-dash - html = re.sub('\s--\s', u'\u2014', html) + html = re.sub(r'\s--\s', u'\u2014', html) return substitute_entites(html) From 843e1f2068cf1707f7f002be7c05c37282e9fa36 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 13:17:32 -0500 Subject: [PATCH 07/14] TXT Input: Basic heuristic processor. --- src/calibre/ebooks/txt/heuristicprocessor.py | 88 ++++++++++++++++++++ src/calibre/ebooks/txt/input.py | 12 ++- src/calibre/ebooks/txt/processor.py | 23 ++++- 3 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py new file mode 100644 index 0000000000..cbfa33a96a --- /dev/null +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +import re +import string + +from calibre import prepare_string_for_xml +from calibre.ebooks.unidecode.unidecoder import Unidecoder + +class TXTHeuristicProcessor(object): + + def __init__(self): + self.ITALICIZE_WORDS = [ + 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', + 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.', + 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', + 'Mlle.', 'Mons.', 'PS.', 'PPS.', + ] + self.ITALICIZE_STYLE_PATS = [ + r'(?msu)_(?P.+?)_', + r'(?msu)/(?P.+?)/', + r'(?msu)~~(?P.+?)~~', + r'(?msu)\*(?P.+?)\*', + r'(?msu)~(?P.+?)~', + r'(?msu)_/(?P.+?)/_', + r'(?msu)_\*(?P.+?)\*_', + r'(?msu)\*/(?P.+?)/\*', + r'(?msu)_\*/(?P.+?)/\*_', + r'(?msu)/:(?P.+?):/', + r'(?msu)\|:(?P.+?):\|', + ] + + def del_maketrans(self, deletechars): + return dict([(ord(x), u'') for x in deletechars]) + + def is_heading(self, line): + if not line: + return False + if len(line) > 40: + return False + + line = Unidecoder().decode(line) + + # punctuation. + if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')): + return False + + # All upper case. + #if line.isupper(): + # return True + # Roman numerals. + #if not line.translate(self.del_maketrans('IVXYCivxyc ')): + # return True + + return True + + def process_paragraph(self, paragraph): + for word in self.ITALICIZE_WORDS: + paragraph = paragraph.replace(word, '%s' % word) + for pat in self.ITALICIZE_STYLE_PATS: + paragraph = re.sub(pat, lambda mo: '%s' % mo.group('words'), paragraph) + return paragraph + + def convert(self, txt, title='', epub_split_size_kb=0): + from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE + txt = clean_txt(txt) + txt = split_txt(txt, epub_split_size_kb) + + processed = [] + last_was_heading = False + for line in txt.split('\n\n'): + if self.is_heading(line): + if not last_was_heading: + processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + else: + processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + last_was_heading = True + else: + processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) + last_was_heading = False + + txt = u'\n'.join(processed) + txt = re.sub('[ ]{2,}', ' ', txt) + + return HTML_TEMPLATE % (title, txt) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 47e92a45a9..fd805f8ce8 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces, detect_paragraph_type, detect_formatting_type + preserve_spaces, detect_paragraph_type, detect_formatting_type, \ + convert_heuristic from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin): '* print: Assume every line starting with 2+ spaces or a tab ' 'starts a paragraph.')), OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'markdown'], + choices=['auto', 'none', 'heuristic', 'markdown'], help=_('Formatting used within the document.' '* auto: Try to auto detect the document formatting.\n' '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' @@ -96,7 +97,12 @@ class TXTInput(InputFormatPlugin): txt = separate_paragraphs_print_formatted(txt) flow_size = getattr(options, 'flow_size', 0) - html = convert_basic(txt, epub_split_size_kb=flow_size) + + if options.formatting_type == 'heuristic': + html = convert_heuristic(txt, epub_split_size_kb=flow_size) + else: + html = convert_basic(txt, epub_split_size_kb=flow_size) + from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f6d628e7c5..79eee79c29 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,6 +9,7 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' @@ -16,7 +17,7 @@ __docformat__ = 'restructuredtext en' HTML_TEMPLATE = u'%s\n%s\n' -def convert_basic(txt, title='', epub_split_size_kb=0): +def clean_txt(txt): if isbytestring(txt): txt = txt.decode('utf-8', 'replace') # Strip whitespace from the beginning and end of the line. Also replace @@ -35,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0): chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) illegal_chars = re.compile(u'|'.join(map(unichr, chars))) txt = illegal_chars.sub('', txt) + + return txt + +def split_txt(txt, epub_split_size_kb=0): #Takes care if there is no point to split if epub_split_size_kb > 0: if isinstance(txt, unicode): @@ -49,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') + return txt + +def convert_basic(txt, title='', epub_split_size_kb=0): + txt = clean_txt(txt) + txt = split_txt(txt, epub_split_size_kb) + lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -57,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0): return HTML_TEMPLATE % (title, u'\n'.join(lines)) +def convert_heuristic(txt, title='', epub_split_size_kb=0): + tp = TXTHeuristicProcessor() + return tp.convert(txt, title, epub_split_size_kb) + def convert_markdown(txt, title='', disable_toc=False): md = markdown.Markdown( extensions=['footnotes', 'tables', 'toc'], @@ -111,12 +126,12 @@ def detect_paragraph_type(txt): # Check for print tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: + if tab_line_count / float(txt_line_count) >= .15: return 'print' # Check for block empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: + if empty_line_count / float(txt_line_count) >= .15: return 'block' # Nothing else matched to assume single. @@ -143,4 +158,4 @@ def detect_formatting_type(txt): if txt.count('\\'+c) > 10: return 'markdown' - return 'none' + return 'heuristic' From c5a679a437c7ab52bb0320c83eef4535c151feb5 Mon Sep 17 00:00:00 2001 From: GRiker Date: Sat, 8 Jan 2011 11:42:31 -0700 Subject: [PATCH 08/14] GwR patch for bogus cover data --- src/calibre/library/catalog.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 0a5d5284e2..1af9c3aa58 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -21,7 +21,7 @@ from calibre.utils.config import config_dir from calibre.utils.date import format_date, isoformat, now as nowf from calibre.utils.logging import default_log as log from calibre.utils.zipfile import ZipFile, ZipInfo -from calibre.utils.magick.draw import thumbnail +from calibre.utils.magick.draw import identify_data, thumbnail FIELDS = ['all', 'author_sort', 'authors', 'comments', 'cover', 'formats', 'id', 'isbn', 'ondevice', 'pubdate', 'publisher', 'rating', @@ -2861,11 +2861,19 @@ class EPUB_MOBI(CatalogPlugin): self.updateProgressMicroStep("Thumbnail %d of %d" % \ (i,len(self.booksByTitle)), i/float(len(self.booksByTitle))) - # Check to see if source file exists - if 'cover' in title and os.path.isfile(title['cover']): + + # Confirm existence, integrity of cover image + valid_cover = True + try: + _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read()) + except: + valid_cover = False + + if valid_cover: # Add the thumb spec to thumbs[] thumbs.append("thumbnail_%d.jpg" % int(title['id'])) - + self.generateThumbnail(title, image_dir, thumb_file) + ''' # Check to see if thumbnail exists thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id'])) thumb_file = 'thumbnail_%d.jpg' % int(title['id']) @@ -2879,6 +2887,7 @@ class EPUB_MOBI(CatalogPlugin): self.generateThumbnail(title, image_dir, thumb_file) else: self.generateThumbnail(title, image_dir, thumb_file) + ''' else: # Use default cover if False and self.verbose: From 8a44bf07edf1b3282a65edd044421b963d4dd794 Mon Sep 17 00:00:00 2001 From: GRiker Date: Sat, 8 Jan 2011 11:48:41 -0700 Subject: [PATCH 09/14] GwR patch for bogus cover data --- src/calibre/library/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 1af9c3aa58..df1341fc38 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -2862,7 +2862,7 @@ class EPUB_MOBI(CatalogPlugin): (i,len(self.booksByTitle)), i/float(len(self.booksByTitle))) - # Confirm existence, integrity of cover image + thumb_file = 'thumbnail_%d.jpg' % int(title['id']) valid_cover = True try: _w, _h, _fmt = identify_data(open(title['cover'], 'rb').read()) From f593b2163154bcd61e21b0e06f8cf0e29514af86 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 13:53:32 -0500 Subject: [PATCH 10/14] TXT Input: Tweak Heuristic italicizing. --- src/calibre/ebooks/txt/heuristicprocessor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py index cbfa33a96a..b0bbd49961 100644 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -21,15 +21,15 @@ class TXTHeuristicProcessor(object): ] self.ITALICIZE_STYLE_PATS = [ r'(?msu)_(?P.+?)_', - r'(?msu)/(?P.+?)/', + r'(?msu)/(?P[^<>]+?)/', r'(?msu)~~(?P.+?)~~', r'(?msu)\*(?P.+?)\*', r'(?msu)~(?P.+?)~', - r'(?msu)_/(?P.+?)/_', + r'(?msu)_/(?P[^<>]+?)/_', r'(?msu)_\*(?P.+?)\*_', - r'(?msu)\*/(?P.+?)/\*', - r'(?msu)_\*/(?P.+?)/\*_', - r'(?msu)/:(?P.+?):/', + r'(?msu)\*/(?P[^<>]+?)/\*', + r'(?msu)_\*/(?P[^<>]+?)/\*_', + r'(?msu)/:(?P[^<>]+?):/', r'(?msu)\|:(?P.+?):\|', ] @@ -84,5 +84,6 @@ class TXTHeuristicProcessor(object): txt = u'\n'.join(processed) txt = re.sub('[ ]{2,}', ' ', txt) + print txt return HTML_TEMPLATE % (title, txt) From c8f18ff02e32f56220f83872f4def00cca58e73d Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 15:49:10 -0500 Subject: [PATCH 11/14] TXT Input: Heuristic processor, use PreProcessor to mark chapter headings. --- src/calibre/ebooks/txt/heuristicprocessor.py | 43 ++++---------------- src/calibre/ebooks/txt/processor.py | 3 -- 2 files changed, 7 insertions(+), 39 deletions(-) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py index b0bbd49961..c4489badc5 100644 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object): r'(?msu)\|:(?P.+?):\|', ] - def del_maketrans(self, deletechars): - return dict([(ord(x), u'') for x in deletechars]) - - def is_heading(self, line): - if not line: - return False - if len(line) > 40: - return False - - line = Unidecoder().decode(line) - - # punctuation. - if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')): - return False - - # All upper case. - #if line.isupper(): - # return True - # Roman numerals. - #if not line.translate(self.del_maketrans('IVXYCivxyc ')): - # return True - - return True - def process_paragraph(self, paragraph): for word in self.ITALICIZE_WORDS: paragraph = paragraph.replace(word, '%s' % word) @@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object): txt = split_txt(txt, epub_split_size_kb) processed = [] - last_was_heading = False for line in txt.split('\n\n'): - if self.is_heading(line): - if not last_was_heading: - processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) - else: - processed.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) - last_was_heading = True - else: - processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) - last_was_heading = False + processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) txt = u'\n'.join(processed) txt = re.sub('[ ]{2,}', ' ', txt) - print txt + html = HTML_TEMPLATE % (title, txt) + + from calibre.ebooks.conversion.utils import PreProcessor + pp = PreProcessor() + html = pp.markup_chapters(html, pp.get_word_count(html), False) - return HTML_TEMPLATE % (title, txt) + return html diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 1e67caccc6..9dc29e45dd 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -9,11 +9,8 @@ import os, re from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator -<<<<<<< TREE from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor -======= from calibre.ebooks.conversion.preprocess import DocAnalysis ->>>>>>> MERGE-SOURCE __license__ = 'GPL v3' __copyright__ = '2009, John Schember ' From bd14205637cbf71fe4aad655de50f4f0fea98a60 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 15:53:51 -0500 Subject: [PATCH 12/14] ... --- src/calibre/ebooks/txt/heuristicprocessor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py index c4489badc5..c4c6a56123 100644 --- a/src/calibre/ebooks/txt/heuristicprocessor.py +++ b/src/calibre/ebooks/txt/heuristicprocessor.py @@ -5,7 +5,6 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' import re -import string from calibre import prepare_string_for_xml from calibre.ebooks.unidecode.unidecoder import Unidecoder @@ -48,7 +47,7 @@ class TXTHeuristicProcessor(object): processed = [] for line in txt.split('\n\n'): processed.append(u'

%s

' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' ')))) - + txt = u'\n'.join(processed) txt = re.sub('[ ]{2,}', ' ', txt) html = HTML_TEMPLATE % (title, txt) From 831ee1fc81b50d9ccd7c771161db322715fa3092 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 16:53:54 -0500 Subject: [PATCH 13/14] TXT Input: Add documentation for the heuristic formatting option to the option help. --- src/calibre/ebooks/txt/input.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5060e124ff..c8ce389574 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -37,6 +37,8 @@ class TXTInput(InputFormatPlugin): help=_('Formatting used within the document.' '* auto: Try to auto detect the document formatting.\n' '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' + '* heuristic: Try to detect formatting for elements such as chapter headings ' + 'and style the elements appropriately.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, From 12cbaa2304db610ccf101bbd4abe13ff58f68fee Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 8 Jan 2011 17:26:32 -0500 Subject: [PATCH 14/14] TXT Input: Make formatting_type options easier to understand. --- src/calibre/ebooks/txt/input.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index c8ce389574..e782cd0cd9 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -35,11 +35,12 @@ class TXTInput(InputFormatPlugin): OptionRecommendation(name='formatting_type', recommended_value='auto', choices=['auto', 'none', 'heuristic', 'markdown'], help=_('Formatting used within the document.' - '* auto: Try to auto detect the document formatting.\n' - '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' - '* heuristic: Try to detect formatting for elements such as chapter headings ' - 'and style the elements appropriately.\n' - '* markdown: Run the input though the markdown pre-processor. ' + '* auto: Automatically decide which formatting processor to use.\n' + '* none: Do not process the document formatting. Everything is a ' + 'paragraph and no styling is applied.\n' + '* heuristic: Process using heuristics to determine formatting such ' + 'as chapter headings and italic text.\n' + '* markdown: Processing using markdown formatting. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. '