Sync to trunk. Misc TXT input fixes.

This commit is contained in:
John Schember 2011-01-08 15:37:22 -05:00
commit 0b08042d46
12 changed files with 156 additions and 71 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

View File

@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
timefmt ='[%a, %d %b, %Y]'
encoding ='utf8'
language ='es_ES'
language ='es'
remove_javascript =True
remove_tags_after =dict(id='Texto')
remove_tags_before =dict(id='Texto')

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gerardo Diez'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
__docformat__ = 'restructuredtext en'
'''
publico.es
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
# Recipe that fetches the RSS sections of the Spanish daily publico.es.
class Publico(BasicNewsRecipe):
    # Descriptive metadata shown for the generated publication.
    title = u'Publico.es'
    __author__ = 'Gerardo Diez'
    publisher = u'Mediapubli Sociedad de Publicaciones y Ediciones S.L.'
    category = 'news, politics, finances, world, spain, science, catalunya'
    language = 'es'
    cover_url = u'http://imagenes.publico.es/css/img/logo_publico.gif'
    timefmt = '[%a, %d %b, %Y]'

    # Download behaviour.
    oldest_article = 1
    max_articles_per_feed = 100
    simultaneous_downloads = 10
    encoding = 'utf8'

    # Page clean-up: keep the element with id "main" and drop scripts,
    # stylesheets, and the comment/toolbar blocks listed below.
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = {'id': 'main'}
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['Noticias_642x50', 'contInfo ancho']}},
        {'name': 'ul', 'attrs': {'class': ['navComentarios', 'comentarios']}},
        {'name': 'div', 'attrs': {'id': ['commentsContext', 'toolbar', 'comentarios']}},
        {'name': 'h5', 'attrs': {'id': 'comentarios'}},
    ]

    # (section title, RSS URL) pairs, one per newspaper section.
    feeds = [
        (u'Internacional', u'http://www.publico.es/estaticos/rss/internacional'),
        (u'Espa\xf1a', u'http://www.publico.es/estaticos/rss/espana'),
        (u'Dinero', u'http://www.publico.es/estaticos/rss/dinero'),
        (u'Ciencias', u'http://www.publico.es/estaticos/rss/ciencias'),
        (u'Culturas', u'http://www.publico.es/estaticos/rss/culturas'),
        (u'Deportes', u'http://www.publico.es/estaticos/rss/deportes'),
        (u'Televisi\xf3n y Gente', u'http://www.publico.es/estaticos/rss/televisionygente'),
        (u'Catalu\xf1a', u'http://www.publico.es/estaticos/rss/catalunya'),
        (u'Viajes', u'http://www.publico.es/estaticos/rss/viajes'),
    ]

View File

@ -17,7 +17,7 @@ class ElPais_RSS(BasicNewsRecipe):
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es_ES'
language = 'es'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.elpais.com/im/tit_logo.gif'

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
exiledonline.com
'''
@ -20,18 +18,20 @@ class Exiled(BasicNewsRecipe):
use_embedded_content = False
encoding = 'utf8'
remove_javascript = True
language = 'en'
language = 'en'
publication_type = 'newsblog'
masthead_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
#topslug{font-size: xx-large; font-weight: bold; color: red}
"""
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
html2lrf_options = [
'--comment' , description
, '--base-font-size', '10'
, '--category' , category
, '--publisher' , publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
@ -47,12 +47,13 @@ class Exiled(BasicNewsRecipe):
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
soup.head.insert(0,mtag)
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
def get_article_url(self, article):
    '''
    Build the download URL for a feed entry.

    The entry's ``link`` value is suffixed with ``all/1/``
    (NOTE(review): presumably this selects the site's all-on-one-page
    view of the article; confirm against exiledonline.com).

    :param article: feed entry; only its ``get('link', None)`` is used.
    :return: the link with ``all/1/`` appended, or ``None`` when the entry
             has no link (previously this raised ``TypeError``).
    '''
    link = article.get('link', None)
    if not link:
        # Nothing to build a URL from; do not concatenate onto None/''.
        return None
    return link + 'all/1/'

View File

@ -78,6 +78,8 @@ class DocAnalysis(object):
linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n', re.DOTALL)
self.lines = linere.findall(raw)
def line_length(self, percent):
@ -561,8 +563,8 @@ class HTMLPreProcessor(object):
html = html.replace(start, '<!--')
html = html.replace(stop, '-->')
# convert ellipsis to entities to prevent wrapping
html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
# convert double dashes to em-dash
html = re.sub('\s--\s', u'\u2014', html)
html = re.sub(r'\s--\s', u'\u2014', html)
return substitute_entites(html)

View File

@ -154,7 +154,7 @@ class PreProcessor(object):
default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
@ -184,6 +184,21 @@ class PreProcessor(object):
self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
return html
def punctuation_unwrap(self, length, content, format):
    '''
    Unwrap hard-wrapped lines based on line length and trailing punctuation.

    A break is replaced by a single space when the character just before
    it is a lowercase (possibly accented) letter, ``,``, ``:``, ``)``,
    ``I``, ``A`` or a semicolon that does not close a four-letter entity
    such as ``&nbsp;``, and that character is itself preceded by at least
    ``length`` non-newline characters.

    :param length: number of characters that must precede the last
                   character of a line for the break to be removed.
    :param content: the text to process.
    :param format: ``'txt'`` treats optional spaces/tabs plus a newline as
                   the break; any other value treats a closing
                   span/p/div (with up to three empty blocks after it)
                   followed by an opening span/div/p as the break.
    :return: ``content`` with every matching break replaced by ``' '``.
    '''
    # Every piece is a unicode literal: the character class contains
    # non-ASCII letters, and mixing a byte string holding them with the
    # unicode pieces below triggers an implicit ASCII decode (and a
    # UnicodeDecodeError) on Python 2.
    # (?<!&\w{4}); is a semicolon that is not part of an entity.
    lookbehind = (u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)IA]"
                  u"|(?<!&\\w{4});))" % length)
    line_ending = u"\\s*</(span|p|div)>\\s*(</(p|span|div)>)?"
    blanklines = (u"\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*"
                  u"(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)"
                  u"</(span|p|div)>\\s*){0,3}\\s*")
    line_opening = u"<(span|div|p)[^>]*>\\s*(<(span|div|p)[^>]*>)?\\s*"
    txt_line_wrap = u"(\u0020|\u0009)*\n"

    if format == 'txt':
        unwrap_regex = lookbehind + txt_line_wrap
    else:
        unwrap_regex = lookbehind + line_ending + blanklines + line_opening

    unwrap = re.compile(unwrap_regex, re.UNICODE)
    return unwrap.sub(' ', content)
def __call__(self, html):
@ -194,7 +209,7 @@ class PreProcessor(object):
totalwords = 0
totalwords = self.get_word_count(html)
if totalwords < 20:
if totalwords < 50:
self.log("not enough text, not preprocessing")
return html
@ -312,8 +327,7 @@ class PreProcessor(object):
self.log("Done dehyphenating")
# Unwrap lines using punctation and line length
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
html = self.punctuation_unwrap(length, html, 'html')
#check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)

View File

@ -25,12 +25,13 @@ class TXTInput(InputFormatPlugin):
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=['auto', 'block', 'single', 'print'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n'
'* block: Treat a blank line as a paragraph break.\n'
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.')),
'starts a paragraph.'
'* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
@ -91,11 +92,21 @@ class TXTInput(InputFormatPlugin):
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
if options.paragraph_type == 'single':
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
# unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
flow_size = getattr(options, 'flow_size', 0)
if options.formatting_type == 'heuristic':

View File

@ -9,7 +9,11 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -116,27 +120,36 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
markdown: Markdown formatting is in the document.
unformatted: most lines have hard line breaks, few/no spaces or indents
returns block, single, print, markdown
returns block, single, print, unformatted
'''
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .15:
return 'print'
# Check for hard line breaks - true if 55% of the doc breaks in the same region
docanalysis = DocAnalysis('txt', txt)
hardbreaks = docanalysis.line_histogram(.55)
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .15:
return 'block'
if hardbreaks:
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .15:
return 'print'
# Nothing else matched to assume single.
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .15:
return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
# return single if hardbreaks is false
return 'single'
def detect_formatting_type(txt):
# Check for markdown
# Headings

View File

@ -16,7 +16,7 @@ from PyQt4.Qt import QWidget, pyqtSignal, QDialog, Qt, QLabel, \
from calibre.gui2.wizard.send_email_ui import Ui_Form
from calibre.utils.smtp import config as smtp_prefs
from calibre.gui2.dialogs.test_email_ui import Ui_Dialog as TE_Dialog
from calibre.gui2 import error_dialog
from calibre.gui2 import error_dialog, question_dialog
class TestEmail(QDialog, TE_Dialog):
@ -92,7 +92,10 @@ class SendEmail(QWidget, Ui_Form):
pa = self.preferred_to_address()
to_set = pa is not None
if self.set_email_settings(to_set):
TestEmail(pa, self).exec_()
if question_dialog(self, _('OK to proceed?'),
_('This will display your email password on the screen'
'. Is it OK to proceed?'), show_copy_button=False):
TestEmail(pa, self).exec_()
def test_email_settings(self, to):
opts = smtp_prefs().parse()

View File

@ -2861,25 +2861,17 @@ class EPUB_MOBI(CatalogPlugin):
self.updateProgressMicroStep("Thumbnail %d of %d" % \
(i,len(self.booksByTitle)),
i/float(len(self.booksByTitle)))
# Check to see if source file exists
if 'cover' in title and os.path.isfile(title['cover']):
# Add the thumb spec to thumbs[]
thumbs.append("thumbnail_%d.jpg" % int(title['id']))
# Check to see if thumbnail exists
thumb_fp = "%s/thumbnail_%d.jpg" % (image_dir,int(title['id']))
thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
if os.path.isfile(thumb_fp):
# Check to see if cover is newer than thumbnail
# os.path.getmtime() = modified time
# os.path.ctime() = creation time
cover_timestamp = os.path.getmtime(title['cover'])
thumb_timestamp = os.path.getmtime(thumb_fp)
if thumb_timestamp < cover_timestamp:
self.generateThumbnail(title, image_dir, thumb_file)
else:
self.generateThumbnail(title, image_dir, thumb_file)
else:
thumb_file = 'thumbnail_%d.jpg' % int(title['id'])
thumb_generated = True
try:
self.generateThumbnail(title, image_dir, thumb_file)
thumbs.append("thumbnail_%d.jpg" % int(title['id']))
except:
thumb_generated = False
if not thumb_generated:
# Use default cover
if False and self.verbose:
self.opts.log.warn(" using default cover for '%s'" % \

View File

@ -533,17 +533,23 @@ PDF documents are one of the worst formats to convert from. They are a fixed pag
Meaning, it is very difficult to determine where one paragraph ends and another begins. |app| will try to unwrap
paragraphs using a configurable, :guilabel:`Line Un-Wrapping Factor`. This is a scale used to determine the length
at which a line should be unwrapped. Valid values are a decimal
between 0 and 1. The default is 0.5, this is the median line length. Lower this value to include more
text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under PDF Input.
between 0 and 1. The default is 0.45, just under the median line length. Lower this value to include more
text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
Also, they often have headers and footers as part of the document that will become included with the text.
Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
removed from the text it can throw off the paragraph unwrapping.
removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read
:ref:`regexptutorial`.
Some limitations of PDF input: complex, multi-column, and image-based documents are not supported.
Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to
represent double ll or double ff or fi, etc. Conversion of these may or may not work depending on just how they are
represented internally in the PDF.
Some limitations of PDF input are:
* Complex, multi-column, and image based documents are not supported.
* Extraction of vector images and tables from within the document is also not supported.
* Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
* Some PDFs store their images upside down with a rotation instruction. |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
To reiterate, **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
output ranging anywhere from decent to unusable, depending on the input PDF.
Comic Book Collections
~~~~~~~~~~~~~~~~~~~~~~~~~