From 0e18837cb6a1baa7f2219d2ca8ec409a66508105 Mon Sep 17 00:00:00 2001
From: Hiroshi Miura
Date: Wed, 5 Jan 2011 23:17:38 +0900
Subject: [PATCH 01/44] fix encoding, reflect change of the site
---
resources/recipes/cnetjapan.recipe | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/resources/recipes/cnetjapan.recipe b/resources/recipes/cnetjapan.recipe
index 1058b90401..b57bce5b97 100644
--- a/resources/recipes/cnetjapan.recipe
+++ b/resources/recipes/cnetjapan.recipe
@@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
(u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
]
language = 'ja'
- encoding = 'Shift_JIS'
+ encoding = 'utf-8'
remove_javascript = True
preprocess_regexps = [
From 90f973e7ac999bd7d143b5152cd9fcbe2e9c1f3a Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Thu, 6 Jan 2011 13:30:40 +0000
Subject: [PATCH 02/44] Improve performance of get_metadata. Up to 4 times
faster when connecting to a device
---
src/calibre/ebooks/metadata/book/base.py | 14 +++--
src/calibre/gui2/device.py | 20 +++---
src/calibre/library/custom_columns.py | 4 +-
src/calibre/library/database2.py | 78 +++++++++++++-----------
src/calibre/library/field_metadata.py | 15 +++++
5 files changed, 79 insertions(+), 52 deletions(-)
diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index 17f2c6705c..799bdef8e6 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -324,14 +324,16 @@ class Metadata(object):
if metadata is None:
traceback.print_stack()
return
- metadata = copy.deepcopy(metadata)
- if '#value#' not in metadata:
- if metadata['datatype'] == 'text' and metadata['is_multiple']:
- metadata['#value#'] = []
+ m = {}
+ for k in metadata:
+ m[k] = copy.copy(metadata[k])
+ if '#value#' not in m:
+ if m['datatype'] == 'text' and m['is_multiple']:
+ m['#value#'] = []
else:
- metadata['#value#'] = None
+ m['#value#'] = None
_data = object.__getattribute__(self, '_data')
- _data['user_metadata'][field] = metadata
+ _data['user_metadata'][field] = m
def template_to_attribute(self, other, ops):
'''
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 6d289a3e5c..944ce03305 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -637,7 +637,7 @@ class DeviceMixin(object): # {{{
self.device_manager.mount_device(kls=FOLDER_DEVICE, kind='folder', path=dir)
def connect_to_bambook(self):
- self.device_manager.mount_device(kls=BAMBOOKWifi, kind='bambook',
+ self.device_manager.mount_device(kls=BAMBOOKWifi, kind='bambook',
path=BAMBOOK.settings().extra_customization)
def connect_to_itunes(self):
@@ -1266,8 +1266,8 @@ class DeviceMixin(object): # {{{
# Force a reset if the caches are not initialized
if reset or not hasattr(self, 'db_book_title_cache'):
# Build a cache (map) of the library, so the search isn't On**2
- self.db_book_title_cache = {}
- self.db_book_uuid_cache = {}
+ db_book_title_cache = {}
+ db_book_uuid_cache = {}
# It might be possible to get here without having initialized the
# library view. In this case, simply give up
try:
@@ -1278,8 +1278,8 @@ class DeviceMixin(object): # {{{
for id in db.data.iterallids():
mi = db.get_metadata(id, index_is_id=True)
title = clean_string(mi.title)
- if title not in self.db_book_title_cache:
- self.db_book_title_cache[title] = \
+ if title not in db_book_title_cache:
+ db_book_title_cache[title] = \
{'authors':{}, 'author_sort':{}, 'db_ids':{}}
# If there are multiple books in the library with the same title
# and author, then remember the last one. That is OK, because as
@@ -1287,12 +1287,14 @@ class DeviceMixin(object): # {{{
# as another.
if mi.authors:
authors = clean_string(authors_to_string(mi.authors))
- self.db_book_title_cache[title]['authors'][authors] = mi
+ db_book_title_cache[title]['authors'][authors] = mi
if mi.author_sort:
aus = clean_string(mi.author_sort)
- self.db_book_title_cache[title]['author_sort'][aus] = mi
- self.db_book_title_cache[title]['db_ids'][mi.application_id] = mi
- self.db_book_uuid_cache[mi.uuid] = mi
+ db_book_title_cache[title]['author_sort'][aus] = mi
+ db_book_title_cache[title]['db_ids'][mi.application_id] = mi
+ db_book_uuid_cache[mi.uuid] = mi
+ self.db_book_title_cache = db_book_title_cache
+ self.db_book_uuid_cache = db_book_uuid_cache
# Now iterate through all the books on the device, setting the
# in_library field. If the UUID matches a book in the library, then
diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py
index ba218c3ecc..d925f7c91d 100644
--- a/src/calibre/library/custom_columns.py
+++ b/src/calibre/library/custom_columns.py
@@ -195,8 +195,8 @@ class CustomColumns(object):
data = self.custom_column_num_map[num]
row = self.data._data[idx] if index_is_id else self.data[idx]
ans = row[self.FIELD_MAP[data['num']]]
- if data['is_multiple'] and data['datatype'] == 'text':
- ans = ans.split('|') if ans else []
+ if ans and data['is_multiple'] and data['datatype'] == 'text':
+ ans = ans.split('|')
if data['display'].get('sort_alpha', False):
ans.sort(cmp=lambda x,y:cmp(x.lower(), y.lower()))
return ans
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 611aa1cc89..96f21b88ee 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -256,7 +256,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'pubdate',
'flags',
'uuid',
- 'has_cover'
+ 'has_cover',
+ '''(SELECT group_concat(authors.name || ':::' || authors.sort, ':#:')
+ FROM authors, books_authors_link as bl
+ WHERE bl.book=books.id and authors.id=bl.author
+ ORDER BY bl.id) au_map''',
+ '(SELECT group_concat(format) FROM data WHERE book=books.id) formats'
]
lines = []
for col in columns:
@@ -275,7 +280,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
'publisher':9, 'series_index':10,
'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15,
- 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20}
+ 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20,
+ 'au_map':21, 'formats':22}
for k,v in self.FIELD_MAP.iteritems():
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
@@ -687,9 +693,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
Convenience method to return metadata as a :class:`Metadata` object.
Note that the list of formats is not verified.
'''
+ row = self.data._data[idx] if index_is_id else self.data[idx]
+ fm = self.FIELD_MAP
+
self.gm_count += 1
- mi = self.data.get(idx, self.FIELD_MAP['all_metadata'],
- row_is_id = index_is_id)
+ mi = row[self.FIELD_MAP['all_metadata']]
if mi is not None:
if get_cover:
# Always get the cover, because the value can be wrong if the
@@ -699,49 +707,47 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.gm_missed += 1
mi = Metadata(None)
- self.data.set(idx, self.FIELD_MAP['all_metadata'], mi,
- row_is_id = index_is_id)
+ self.data.set(idx, fm['all_metadata'], mi, row_is_id = index_is_id)
- aut_list = self.authors_with_sort_strings(idx, index_is_id=index_is_id)
+ aut_list = row[fm['au_map']]
+ aut_list = [p.split(':::') for p in aut_list.split(':#:')]
aum = []
aus = {}
for (author, author_sort) in aut_list:
aum.append(author)
aus[author] = author_sort
- mi.title = self.title(idx, index_is_id=index_is_id)
+ mi.title = row[fm['title']]
mi.authors = aum
- mi.author_sort = self.author_sort(idx, index_is_id=index_is_id)
+ mi.author_sort = row[fm['author_sort']]
mi.author_sort_map = aus
- mi.comments = self.comments(idx, index_is_id=index_is_id)
- mi.publisher = self.publisher(idx, index_is_id=index_is_id)
- mi.timestamp = self.timestamp(idx, index_is_id=index_is_id)
- mi.pubdate = self.pubdate(idx, index_is_id=index_is_id)
- mi.uuid = self.uuid(idx, index_is_id=index_is_id)
- mi.title_sort = self.title_sort(idx, index_is_id=index_is_id)
- mi.formats = self.formats(idx, index_is_id=index_is_id,
- verify_formats=False)
- if hasattr(mi.formats, 'split'):
- mi.formats = mi.formats.split(',')
+ mi.comments = row[fm['comments']]
+ mi.publisher = row[fm['publisher']]
+ mi.timestamp = row[fm['timestamp']]
+ mi.pubdate = row[fm['pubdate']]
+ mi.uuid = row[fm['uuid']]
+ mi.title_sort = row[fm['sort']]
+ formats = row[fm['formats']]
+ if hasattr(formats, 'split'):
+ mi.formats = formats.split(',')
else:
mi.formats = None
- tags = self.tags(idx, index_is_id=index_is_id)
+ tags = row[fm['tags']]
if tags:
mi.tags = [i.strip() for i in tags.split(',')]
- mi.series = self.series(idx, index_is_id=index_is_id)
+ mi.series = row[fm['series']]
if mi.series:
- mi.series_index = self.series_index(idx, index_is_id=index_is_id)
- mi.rating = self.rating(idx, index_is_id=index_is_id)
- mi.isbn = self.isbn(idx, index_is_id=index_is_id)
+ mi.series_index = row[fm['series_index']]
+ mi.rating = row[fm['rating']]
+ mi.isbn = row[fm['isbn']]
id = idx if index_is_id else self.id(idx)
mi.application_id = id
mi.id = id
- for key,meta in self.field_metadata.iteritems():
- if meta['is_custom']:
- mi.set_user_metadata(key, meta)
- mi.set(key, val=self.get_custom(idx, label=meta['label'],
- index_is_id=index_is_id),
- extra=self.get_custom_extra(idx, label=meta['label'],
- index_is_id=index_is_id))
+ for key,meta in self.field_metadata.custom_iteritems():
+ mi.set_user_metadata(key, meta)
+ mi.set(key, val=self.get_custom(idx, label=meta['label'],
+ index_is_id=index_is_id),
+ extra=self.get_custom_extra(idx, label=meta['label'],
+ index_is_id=index_is_id))
if get_cover:
mi.cover = self.cover(id, index_is_id=True, as_path=True)
return mi
@@ -878,10 +884,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def formats(self, index, index_is_id=False, verify_formats=True):
''' Return available formats as a comma separated list or None if there are no available formats '''
id = index if index_is_id else self.id(index)
- try:
- formats = self.conn.get('SELECT format FROM data WHERE book=?', (id,))
- formats = map(lambda x:x[0], formats)
- except:
+ formats = self.data.get(id, self.FIELD_MAP['formats'], row_is_id = True)
+ if not formats:
return None
if not verify_formats:
return ','.join(formats)
@@ -1607,6 +1611,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
','.join([a.replace(',', '|') for a in authors]),
row_is_id=True)
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
+ aum = self.authors_with_sort_strings(id, index_is_id=True)
+ self.data.set(id, self.FIELD_MAP['au_map'],
+ ':#:'.join([':::'.join((au.replace(',', '|'), aus)) for (au, aus) in aum]),
+ row_is_id=True)
def set_authors(self, id, authors, notify=True, commit=True):
'''
diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py
index 1be6604d5d..676eb13d2b 100644
--- a/src/calibre/library/field_metadata.py
+++ b/src/calibre/library/field_metadata.py
@@ -180,6 +180,15 @@ class FieldMetadata(dict):
'search_terms':['author_sort'],
'is_custom':False,
'is_category':False}),
+ ('au_map', {'table':None,
+ 'column':None,
+ 'datatype':'text',
+ 'is_multiple':',',
+ 'kind':'field',
+ 'name':None,
+ 'search_terms':[],
+ 'is_custom':False,
+ 'is_category':False}),
('comments', {'table':None,
'column':None,
'datatype':'text',
@@ -400,6 +409,12 @@ class FieldMetadata(dict):
for key in self._tb_cats:
yield (key, self._tb_cats[key])
+ def custom_iteritems(self):
+ for key in self._tb_cats:
+ fm = self._tb_cats[key]
+ if fm['is_custom']:
+ yield (key, self._tb_cats[key])
+
def items(self):
return list(self.iteritems())
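The speed-up in this patch comes from two changes of pattern: custom-column metadata is now copied key by key instead of deep-copied, and get_metadata() reads every field straight out of the cached row through FIELD_MAP indices (plus the new au_map and formats columns) instead of calling one accessor, and hence doing one lookup, per field. A minimal sketch of that row/FIELD_MAP access pattern, using a made-up row rather than real calibre data:

    # Sketch only: read all fields from one cached row via an index map,
    # rather than issuing a separate lookup for each field.
    FIELD_MAP = {'title': 1, 'comments': 7, 'author_sort': 12}  # subset of the map above

    def metadata_from_row(row, field_map):
        # one pass over the cached row, no further database access
        return dict((name, row[idx]) for name, idx in field_map.items())

    row = [101, 'Example Title', None, None, None, None, None,
           'Some comments', None, None, None, None, 'Doe, Jane']
    print(metadata_from_row(row, FIELD_MAP))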
From ae759ebd109d1d4040a54ee676bd203070bc8f1f Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Thu, 6 Jan 2011 19:35:04 +0000
Subject: [PATCH 03/44] Fix ticket #8214 (Detect 'senseless' user input)
---
src/calibre/gui2/preferences/save_template.py | 8 ++++++--
src/calibre/utils/formatter.py | 2 +-
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/calibre/gui2/preferences/save_template.py b/src/calibre/gui2/preferences/save_template.py
index a7f57536d5..4c00a14c0f 100644
--- a/src/calibre/gui2/preferences/save_template.py
+++ b/src/calibre/gui2/preferences/save_template.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
from PyQt4.Qt import QWidget, pyqtSignal
-from calibre.gui2 import error_dialog
+from calibre.gui2 import error_dialog, question_dialog
from calibre.gui2.preferences.save_template_ui import Ui_Form
from calibre.library.save_to_disk import FORMAT_ARG_DESCS, preprocess_template
from calibre.utils.formatter import validation_formatter
@@ -52,7 +52,11 @@ class SaveTemplate(QWidget, Ui_Form):
'''
tmpl = preprocess_template(self.opt_template.text())
try:
- validation_formatter.validate(tmpl)
+ t = validation_formatter.validate(tmpl)
+ if t.find(validation_formatter._validation_string) < 0:
+ return question_dialog(self, _('Constant template'),
+ _('The template contains no {fields}, so all '
+ 'books will have the same name. Is this OK?'))
except Exception, err:
error_dialog(self, _('Invalid template'),
''+_('The template %s is invalid:')%tmpl + \
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index f4e687b419..23763a25bf 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -453,7 +453,7 @@ class TemplateFormatter(string.Formatter):
class ValidateFormatter(TemplateFormatter):
'''
- Provides a format function that substitutes '' for any missing value
+ Provides a formatter that substitutes the validation string for every value
'''
def get_value(self, key, args, kwargs):
return self._validation_string
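The 'senseless input' check relies on how the validation formatter works: every {field} reference renders as a fixed marker string, so if the rendered template does not contain the marker, the template referenced no fields and every book would get the same name. A standalone sketch of the idea with Python's string.Formatter (the marker value is illustrative, not calibre's actual _validation_string):

    import string

    class ValidationFormatter(string.Formatter):
        # every field reference renders as this marker
        _validation_string = '0001'

        def get_value(self, key, args, kwargs):
            return self._validation_string

    vf = ValidationFormatter()

    def is_constant_template(tmpl):
        # True when the rendered result never mentions the marker,
        # i.e. the template contains no {fields}
        return vf.vformat(tmpl, [], {}).find(vf._validation_string) < 0

    print(is_constant_template('{author_sort}/{title}'))  # False
    print(is_constant_template('my_books/untitled'))      # True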
From 2cdbc5ea1195038fc55ab1f3fd1c10fba7a5545d Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Thu, 6 Jan 2011 21:58:07 +0000
Subject: [PATCH 04/44] Use sort_concat instead of group_concat for the au_map
meta2 field
---
src/calibre/library/database2.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 96f21b88ee..0f3a1a72fa 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -257,7 +257,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'flags',
'uuid',
'has_cover',
- '''(SELECT group_concat(authors.name || ':::' || authors.sort, ':#:')
+ '''(SELECT sortconcat(bl.id, authors.name || ':::' || REPLACE(authors.sort, ',','|'))
FROM authors, books_authors_link as bl
WHERE bl.book=books.id and authors.id=bl.author
ORDER BY bl.id) au_map''',
@@ -710,12 +710,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.data.set(idx, fm['all_metadata'], mi, row_is_id = index_is_id)
aut_list = row[fm['au_map']]
- aut_list = [p.split(':::') for p in aut_list.split(':#:')]
+ aut_list = [p.split(':::') for p in aut_list.split(',')]
aum = []
aus = {}
for (author, author_sort) in aut_list:
aum.append(author)
- aus[author] = author_sort
+ aus[author] = author_sort.replace('|', ',')
mi.title = row[fm['title']]
mi.authors = aum
mi.author_sort = row[fm['author_sort']]
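The REPLACE(authors.sort, ',', '|') on the SQL side and the matching replace('|', ',') in get_metadata exist because the aggregate's outer separator is now a plain comma while author-sort values themselves contain commas. A small round-trip sketch of that escaping scheme in plain Python, outside the database (names are made up):

    # pack: roughly what the SQL expression produces; unpack: what get_metadata does
    def pack(pairs):
        return ','.join(name + ':::' + sort.replace(',', '|') for name, sort in pairs)

    def unpack(packed):
        return [(name, sort.replace('|', ','))
                for name, sort in (p.split(':::') for p in packed.split(','))]

    pairs = [('Jane Doe', 'Doe, Jane'), ('John Roe', 'Roe, John')]
    print(pack(pairs))                   # Jane Doe:::Doe| Jane,John Roe:::Roe| John
    print(unpack(pack(pairs)) == pairs)  # True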
From cb55c9a4f9fc0bc1cbe71afb2082a657100730d0 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Fri, 7 Jan 2011 08:12:05 +0000
Subject: [PATCH 05/44] Add another motorola droid...
---
src/calibre/devices/android/driver.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index c2db8ddd77..b7e2f0fd2e 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -27,8 +27,9 @@ class ANDROID(USBMS):
0x040d : { 0x8510 : [0x0001], 0x0851 : [0x1] },
# Motorola
- 0x22b8 : { 0x41d9 : [0x216], 0x2d61: [0x100], 0x2d67 : [0x100],
- 0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216] },
+ 0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
+ 0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
+ 0x4286 : [0x216] },
# Sony Ericsson
0xfce : { 0xd12e : [0x0100]},
From 9e82bd6f238145788ec8a58288f1f9dfafb9365e Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Fri, 7 Jan 2011 08:25:21 +0000
Subject: [PATCH 06/44] Fix problem with formatter interpreting a missing format
letter as ERROR instead of 's'.
---
src/calibre/utils/formatter.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 23763a25bf..46b52b9ce5 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -316,8 +316,6 @@ class TemplateFormatter(string.Formatter):
except:
raise ValueError(
_('format: type {0} requires a decimal (float) value, got {1}').format(typ, val))
- else:
- raise ValueError(_('format: unknown format type letter {0}').format(typ))
return unicode(('{0:'+fmt+'}').format(val))
def _explode_format_string(self, fmt):
From 21392dc27caa3f450047b4aaa127ba5ca9448cde Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Fri, 7 Jan 2011 10:45:06 +0000
Subject: [PATCH 07/44] 1) add a special aggregate class to build the au_map.
2) remove the extra format column (should never have been added)
---
src/calibre/library/database2.py | 15 +++++----------
src/calibre/library/sqlite.py | 18 ++++++++++++++++++
2 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 0f3a1a72fa..8fef5d36bc 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -257,11 +257,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
'flags',
'uuid',
'has_cover',
- '''(SELECT sortconcat(bl.id, authors.name || ':::' || REPLACE(authors.sort, ',','|'))
- FROM authors, books_authors_link as bl
- WHERE bl.book=books.id and authors.id=bl.author
- ORDER BY bl.id) au_map''',
- '(SELECT group_concat(format) FROM data WHERE book=books.id) formats'
+ ('au_map', 'authors', 'author', 'aum_sortconcat(link.id, authors.name, authors.sort)')
]
lines = []
for col in columns:
@@ -278,10 +274,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.FIELD_MAP = {'id':0, 'title':1, 'authors':2, 'timestamp':3,
'size':4, 'rating':5, 'tags':6, 'comments':7, 'series':8,
- 'publisher':9, 'series_index':10,
- 'sort':11, 'author_sort':12, 'formats':13, 'isbn':14, 'path':15,
- 'lccn':16, 'pubdate':17, 'flags':18, 'uuid':19, 'cover':20,
- 'au_map':21, 'formats':22}
+ 'publisher':9, 'series_index':10, 'sort':11, 'author_sort':12,
+ 'formats':13, 'isbn':14, 'path':15, 'lccn':16, 'pubdate':17,
+ 'flags':18, 'uuid':19, 'cover':20, 'au_map':21}
for k,v in self.FIELD_MAP.iteritems():
self.field_metadata.set_field_record_index(k, v, prefer_custom=False)
@@ -710,7 +705,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.data.set(idx, fm['all_metadata'], mi, row_is_id = index_is_id)
aut_list = row[fm['au_map']]
- aut_list = [p.split(':::') for p in aut_list.split(',')]
+ aut_list = [p.split(':::') for p in aut_list.split(':#:')]
aum = []
aus = {}
for (author, author_sort) in aut_list:
diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py
index 0458ada27b..75856dd0f6 100644
--- a/src/calibre/library/sqlite.py
+++ b/src/calibre/library/sqlite.py
@@ -87,6 +87,23 @@ class SortedConcatenate(object):
class SafeSortedConcatenate(SortedConcatenate):
sep = '|'
+class AumSortedConcatenate(object):
+ '''String concatenation aggregator for the author sort map'''
+ def __init__(self):
+ self.ans = {}
+
+ def step(self, ndx, author, sort):
+ if author is not None:
+ self.ans[ndx] = author + ':::' + sort
+
+ def finalize(self):
+ keys = self.ans.keys()
+ if len(keys) == 0:
+ return None
+ if len(keys) == 1:
+ return self.ans[keys[0]]
+ return ':#:'.join([self.ans[v] for v in sorted(keys)])
+
class Connection(sqlite.Connection):
def get(self, *args, **kw):
@@ -155,6 +172,7 @@ class DBThread(Thread):
c_ext_loaded = load_c_extensions(self.conn)
self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
self.conn.create_aggregate('concat', 1, Concatenate)
+ self.conn.create_aggregate('aum_sortconcat', 3, AumSortedConcatenate)
if not c_ext_loaded:
self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
self.conn.create_aggregate('sort_concat', 2, SafeSortedConcatenate)
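AumSortedConcatenate follows the sqlite3 aggregate protocol: the class is instantiated per group, step() is called once per row, and finalize() returns the combined value, ordered by the link-table id so authors keep their insertion order. A self-contained sketch of the same class registered against an in-memory database with the standard sqlite3 module (table name and rows are invented for the demo):

    import sqlite3

    class AumSortedConcatenate(object):
        '''Concatenate author:::sort pairs, ordered by the link id.'''
        def __init__(self):
            self.ans = {}

        def step(self, ndx, author, sort):
            if author is not None:
                self.ans[ndx] = author + ':::' + sort

        def finalize(self):
            keys = sorted(self.ans)
            if not keys:
                return None
            return ':#:'.join(self.ans[k] for k in keys)

    conn = sqlite3.connect(':memory:')
    conn.create_aggregate('aum_sortconcat', 3, AumSortedConcatenate)
    conn.execute('CREATE TABLE link(id INTEGER, name TEXT, sort TEXT)')
    conn.execute("INSERT INTO link VALUES (2, 'John Roe', 'Roe, John')")
    conn.execute("INSERT INTO link VALUES (1, 'Jane Doe', 'Doe, Jane')")
    print(conn.execute('SELECT aum_sortconcat(id, name, sort) FROM link').fetchone()[0])
    # Jane Doe:::Doe, Jane:#:John Roe:::Roe, John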
From e1a5bef8967c18c773b506b9c29a9a8849cb8b37 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Fri, 7 Jan 2011 12:41:48 +0000
Subject: [PATCH 08/44] Fix scrolling to column zero when clicking on a column
to sort.
---
src/calibre/gui2/library/views.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/gui2/library/views.py b/src/calibre/gui2/library/views.py
index c1dd5b3766..e1e9cf4456 100644
--- a/src/calibre/gui2/library/views.py
+++ b/src/calibre/gui2/library/views.py
@@ -612,7 +612,7 @@ class BooksView(QTableView): # {{{
if row > -1:
h = self.horizontalHeader()
for i in range(h.count()):
- if not h.isSectionHidden(i):
+ if not h.isSectionHidden(i) and h.sectionViewportPosition(i) >= 0:
self.scrollTo(self.model().index(row, i))
break
From 910a3f1accbb8692a8dbbf75614a5d826d515493 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Fri, 7 Jan 2011 13:22:58 +0000
Subject: [PATCH 09/44] Add the list_item function to the template processor,
to avoid using complicated regexp to carve them out. Add it to the
documentation.
---
src/calibre/manual/template_lang.rst | 1 +
src/calibre/utils/formatter.py | 15 +++++++++++++--
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/calibre/manual/template_lang.rst b/src/calibre/manual/template_lang.rst
index 1bf08c11f9..f64a413d3e 100644
--- a/src/calibre/manual/template_lang.rst
+++ b/src/calibre/manual/template_lang.rst
@@ -121,6 +121,7 @@ The functions available are:
* ``contains(pattern, text if match, text if not match`` -- checks if field contains matches for the regular expression `pattern`. Returns `text if match` if matches are found, otherwise it returns `text if no match`.
* ``count(separator)`` -- interprets the value as a list of items separated by `separator`, returning the number of items in the list. Most lists use a comma as the separator, but authors uses an ampersand. Examples: `{tags:count(,)}`, `{authors:count(&)}`
* ``ifempty(text)`` -- if the field is not empty, return the value of the field. Otherwise return `text`.
+ * ``list_item(index, separator)`` -- interpret the value as a list of items separated by `separator`, returning the `index`th item. The first item is number zero. The last item can be returned using `list_item(-1,separator)`. If the item is not in the list, then the empty value is returned. The separator has the same meaning as in the `count` function.
* ``lookup(pattern, field, pattern, field, ..., else_field)`` -- like switch, except the arguments are field (metadata) names, not text. The value of the appropriate field will be fetched and used. Note that because composite columns are fields, you can use this function in one composite field to use the value of some other composite field. This is extremely useful when constructing variable save paths (more later).
* ``re(pattern, replacement)`` -- return the field after applying the regular expression. All instances of `pattern` are replaced with `replacement`. As in all of |app|, these are python-compatible regular expressions.
* ``shorten(left chars, middle text, right chars)`` -- Return a shortened version of the field, consisting of `left chars` characters from the beginning of the field, followed by `middle text`, followed by `right chars` characters from the end of the string. `Left chars` and `right chars` must be integers. For example, assume the title of the book is `Ancient English Laws in the Times of Ivanhoe`, and you want it to fit in a space of at most 15 characters. If you use ``{title:shorten(9,-,5)}``, the result will be `Ancient E-nhoe`. If the field's length is less than ``left chars`` + ``right chars`` + the length of ``middle text``, then the field will be used intact. For example, the title `The Dome` would not be changed.
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 46b52b9ce5..2e4f843c3d 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -281,19 +281,30 @@ class TemplateFormatter(string.Formatter):
def _count(self, val, sep):
return unicode(len(val.split(sep)))
+ def _list_item(self, val, index, sep):
+ if not val:
+ return ''
+ index = int(index)
+ val = val.split(sep)
+ try:
+ return val[index]
+ except:
+ return ''
+
functions = {
'uppercase' : (0, lambda s,x: x.upper()),
'lowercase' : (0, lambda s,x: x.lower()),
'titlecase' : (0, lambda s,x: titlecase(x)),
'capitalize' : (0, lambda s,x: capitalize(x)),
'contains' : (3, _contains),
+ 'count' : (1, _count),
'ifempty' : (1, _ifempty),
+ 'list_item' : (2, _list_item),
'lookup' : (-1, _lookup),
're' : (2, _re),
'shorten' : (3, _shorten),
'switch' : (-1, _switch),
- 'test' : (2, _test),
- 'count' : (1, _count),
+ 'test' : (2, _test)
}
def _do_format(self, val, fmt):
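For reference, the behaviour of the new list_item function, sketched as a free function outside TemplateFormatter (sample values are illustrative): the value is split on the separator, the index'th element is returned, negative indexes count from the end, and anything out of range yields the empty string.

    def list_item(val, index, sep):
        # mirrors the _list_item method added above
        if not val:
            return ''
        items = val.split(sep)
        try:
            return items[int(index)]
        except IndexError:
            return ''

    print(list_item('Fantasy,SciFi,History', 0, ','))   # Fantasy
    print(list_item('Fantasy,SciFi,History', -1, ','))  # History
    print(list_item('Fantasy,SciFi,History', 5, ','))   # (empty string)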
From b0a3912867f44e68db7c00d3bb2cb8149c4f1884 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Fri, 7 Jan 2011 15:25:56 +0000
Subject: [PATCH 10/44] Optimized search, doing and/or/not shortcutting
---
src/calibre/library/caches.py | 23 ++++---
src/calibre/utils/search_query_parser.py | 78 ++++++++++++++++++------
2 files changed, 72 insertions(+), 29 deletions(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 980c9f1fa9..d56111b30a 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -181,7 +181,7 @@ class ResultCache(SearchQueryParser): # {{{
self.search_restriction = ''
self.field_metadata = field_metadata
self.all_search_locations = field_metadata.get_search_terms()
- SearchQueryParser.__init__(self, self.all_search_locations)
+ SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
self.build_date_relop_dict()
self.build_numeric_relop_dict()
@@ -264,7 +264,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, relop_le]
}
- def get_dates_matches(self, location, query):
+ def get_dates_matches(self, location, query, candidates):
matches = set([])
if len(query) < 2:
return matches
@@ -274,13 +274,13 @@ class ResultCache(SearchQueryParser): # {{{
loc = self.field_metadata[location]['rec_index']
if query == 'false':
- for item in self._data:
+ for item in [self._data[id] for id in candidates]:
if item is None: continue
if item[loc] is None or item[loc] <= UNDEFINED_DATE:
matches.add(item[0])
return matches
if query == 'true':
- for item in self._data:
+ for item in [self._data[id] for id in candidates]:
if item is None: continue
if item[loc] is not None and item[loc] > UNDEFINED_DATE:
matches.add(item[0])
@@ -319,7 +319,7 @@ class ResultCache(SearchQueryParser): # {{{
field_count = query.count('-') + 1
else:
field_count = query.count('/') + 1
- for item in self._data:
+ for item in [self._data[id] for id in candidates]:
if item is None or item[loc] is None: continue
if relop(item[loc], qd, field_count):
matches.add(item[0])
@@ -335,7 +335,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, lambda r, q: r <= q]
}
- def get_numeric_matches(self, location, query, val_func = None):
+ def get_numeric_matches(self, location, query, candidates, val_func = None):
matches = set([])
if len(query) == 0:
return matches
@@ -381,7 +381,7 @@ class ResultCache(SearchQueryParser): # {{{
except:
return matches
- for item in self._data:
+ for item in [self._data[id] for id in candidates]:
if item is None:
continue
v = val_func(item)
@@ -393,8 +393,13 @@ class ResultCache(SearchQueryParser): # {{{
matches.add(item[0])
return matches
- def get_matches(self, location, query, allow_recursion=True):
+ def get_matches(self, location, query, allow_recursion=True, candidates=None):
matches = set([])
+ if candidates is None:
+ candidates = self.universal_set()
+ if len(candidates) == 0:
+ return matches
+
if query and query.strip():
# get metadata key associated with the search term. Eliminates
# dealing with plurals and other aliases
@@ -476,7 +481,7 @@ class ResultCache(SearchQueryParser): # {{{
else:
q = query
- for item in self._data:
+ for item in [self._data[id] for id in candidates]:
if item is None: continue
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py
index db7c7bde5f..4f2c899bce 100644
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@@ -118,8 +118,9 @@ class SearchQueryParser(object):
failed.append(test[0])
return failed
- def __init__(self, locations, test=False):
+ def __init__(self, locations, test=False, optimize=False):
self._tests_failed = False
+ self.optimize = optimize
# Define a token
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
locations)
@@ -182,38 +183,50 @@ class SearchQueryParser(object):
# empty the list of searches used for recursion testing
self.recurse_level = 0
self.searches_seen = set([])
- return self._parse(query)
+ candidates = self.universal_set()
+ return self._parse(query, candidates)
# this parse is used internally because it doesn't clear the
# recursive search test list. However, we permit seeing the
# same search a few times because the search might appear within
# another search.
- def _parse(self, query):
+ def _parse(self, query, candidates):
self.recurse_level += 1
res = self._parser.parseString(query)[0]
- t = self.evaluate(res)
+ t = self.evaluate(res, candidates)
self.recurse_level -= 1
return t
def method(self, group_name):
return getattr(self, 'evaluate_'+group_name)
- def evaluate(self, parse_result):
- return self.method(parse_result.getName())(parse_result)
+ def evaluate(self, parse_result, candidates):
+ return self.method(parse_result.getName())(parse_result, candidates)
- def evaluate_and(self, argument):
- return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+ def evaluate_and(self, argument, candidates):
+ # RHS checks only those items matched by LHS
+ # returns result of RHS check: RHmatches(LHmatches(c))
+ # return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+ l = self.evaluate(argument[0], candidates)
+ return l.intersection(self.evaluate(argument[1], l))
- def evaluate_or(self, argument):
- return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+ def evaluate_or(self, argument, candidates):
+ # RHS checks only those elements not matched by LHS
+ # returns LHS union RHS: LHmatches(c) + RHmatches(c-LHmatches(c))
+ # return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+ l = self.evaluate(argument[0], candidates)
+ return l.union(self.evaluate(argument[1], candidates.difference(l)))
- def evaluate_not(self, argument):
- return self.universal_set().difference(self.evaluate(argument[0]))
+ def evaluate_not(self, argument, candidates):
+ # unary op checks only candidates. Result: list of items matching
+ # returns: c - matches(c)
+ # return self.universal_set().difference(self.evaluate(argument[0]))
+ return candidates.difference(self.evaluate(argument[0], candidates))
- def evaluate_parenthesis(self, argument):
- return self.evaluate(argument[0])
+ def evaluate_parenthesis(self, argument, candidates):
+ return self.evaluate(argument[0], candidates)
- def evaluate_token(self, argument):
+ def evaluate_token(self, argument, candidates):
location = argument[0]
query = argument[1]
if location.lower() == 'search':
@@ -224,17 +237,26 @@ class SearchQueryParser(object):
raise ParseException(query, len(query), 'undefined saved search', self)
if self.recurse_level > 5:
self.searches_seen.add(query)
- return self._parse(saved_searches().lookup(query))
+ return self._parse(saved_searches().lookup(query), candidates)
except: # convert all exceptions (e.g., missing key) to a parse error
raise ParseException(query, len(query), 'undefined saved search', self)
- return self.get_matches(location, query)
+ return self._get_matches(location, query, candidates)
+
+ def _get_matches(self, location, query, candidates):
+ if self.optimize:
+ return self.get_matches(location, query, candidates=candidates)
+ else:
+ return self.get_matches(location, query)
def get_matches(self, location, query):
'''
Should return the set of matches for :param:'location` and :param:`query`.
+ If you set the optimized parameter in __init__, this method must accept
+ a named parameter 'candidates'
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
:param:`query` is a string literal.
+ :param: optional named parameter candidates, a set of items to check.
'''
return set([])
@@ -561,7 +583,7 @@ class Tester(SearchQueryParser):
def universal_set(self):
return self._universal_set
- def get_matches(self, location, query):
+ def get_matches(self, location, query, candidates=None):
location = location.lower()
if location in self.fields.keys():
getter = operator.itemgetter(self.fields[location])
@@ -573,8 +595,13 @@ class Tester(SearchQueryParser):
if not query:
return set([])
query = query.lower()
- return set(key for key, val in self.texts.items() \
- if query and query in getattr(getter(val), 'lower', lambda : '')())
+ if candidates:
+ return set(key for key, val in self.texts.items() \
+ if key in candidates and query and query
+ in getattr(getter(val), 'lower', lambda : '')())
+ else:
+ return set(key for key, val in self.texts.items() \
+ if query and query in getattr(getter(val), 'lower', lambda : '')())
@@ -592,6 +619,7 @@ class Tester(SearchQueryParser):
def main(args=sys.argv):
+ print 'testing unoptimized'
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
@@ -601,6 +629,16 @@ def main(args=sys.argv):
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
return 1
+ print '\n\ntesting optimized'
+ tester = Tester(['authors', 'author', 'series', 'formats', 'format',
+ 'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
+ 'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
+ 'all', 'search'], test=True, optimize=True)
+ failed = tester.run_tests()
+ if tester._tests_failed or failed:
+ print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
+ return 1
+
return 0
if __name__ == '__main__':
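The optimization rests on threading a candidate set through the boolean operators: AND evaluates its right side only over the left side's matches, OR only over what the left side did not match, and NOT subtracts from the current candidates rather than the universal set. A toy sketch of the same shortcutting, independent of the parser (each 'term' here is just a precomputed set of matching ids, not a real search):

    def ev_term(term, candidates):
        # evaluating a term against candidates is modelled as an intersection
        return term & candidates

    def ev_and(lhs, rhs, candidates):
        l = ev_term(lhs, candidates)
        return l & ev_term(rhs, l)               # RHS sees only LHS matches

    def ev_or(lhs, rhs, candidates):
        l = ev_term(lhs, candidates)
        return l | ev_term(rhs, candidates - l)  # RHS skips what LHS already matched

    def ev_not(term, candidates):
        return candidates - ev_term(term, candidates)

    universe = set(range(10))
    a, b = {1, 2, 3}, {3, 4, 5}
    print(sorted(ev_and(a, b, universe)))  # [3]
    print(sorted(ev_or(a, b, universe)))   # [1, 2, 3, 4, 5]
    print(sorted(ev_not(a, universe)))     # [0, 4, 5, 6, 7, 8, 9]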
From 9bbff15c27c2be0b6101f17ddaa7f53a504824ea Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 02:12:09 +0800
Subject: [PATCH 11/44] text processing tweaks
---
src/calibre/ebooks/conversion/utils.py | 4 ++--
src/calibre/ebooks/txt/input.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 27dacdf5fb..52d1bcc619 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -190,7 +190,7 @@ class PreProcessor(object):
            line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
            blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
- txt_line_wrap = u"(\u0020|\u0009)*\n"
+ txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt':
@@ -357,6 +357,6 @@ class PreProcessor(object):
        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
# Center separator lines
-        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p>' + '\g<break>' + '</p>', html)
+        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p>' + '\g<break>' + '</p>', html)
return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 98756c5fa1..eac46385a7 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
# We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
- if options.paragraph_type == 'single' or 'unformatted':
+ if options.paragraph_type in ('single', 'unformatted'):
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
From 843e1f2068cf1707f7f002be7c05c37282e9fa36 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 8 Jan 2011 13:17:32 -0500
Subject: [PATCH 12/44] TXT Input: Basic heuristic processor.
---
src/calibre/ebooks/txt/heuristicprocessor.py | 88 ++++++++++++++++++++
src/calibre/ebooks/txt/input.py | 12 ++-
src/calibre/ebooks/txt/processor.py | 23 ++++-
3 files changed, 116 insertions(+), 7 deletions(-)
create mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
new file mode 100644
index 0000000000..cbfa33a96a
--- /dev/null
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+import re
+import string
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.unidecode.unidecoder import Unidecoder
+
+class TXTHeuristicProcessor(object):
+
+ def __init__(self):
+ self.ITALICIZE_WORDS = [
+ 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+ 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+ 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+ 'Mlle.', 'Mons.', 'PS.', 'PPS.',
+ ]
+ self.ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>.+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>.+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>.+?)/\*',
+            r'(?msu)_\*/(?P<words>.+?)/\*_',
+            r'(?msu)/:(?P<words>.+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+ ]
+
+ def del_maketrans(self, deletechars):
+ return dict([(ord(x), u'') for x in deletechars])
+
+ def is_heading(self, line):
+ if not line:
+ return False
+ if len(line) > 40:
+ return False
+
+ line = Unidecoder().decode(line)
+
+ # punctuation.
+ if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
+ return False
+
+ # All upper case.
+ #if line.isupper():
+ # return True
+ # Roman numerals.
+ #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
+ # return True
+
+ return True
+
+ def process_paragraph(self, paragraph):
+ for word in self.ITALICIZE_WORDS:
+            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
+        for pat in self.ITALICIZE_STYLE_PATS:
+            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
+ return paragraph
+
+ def convert(self, txt, title='', epub_split_size_kb=0):
+ from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
+ txt = clean_txt(txt)
+ txt = split_txt(txt, epub_split_size_kb)
+
+ processed = []
+ last_was_heading = False
+ for line in txt.split('\n\n'):
+ if self.is_heading(line):
+ if not last_was_heading:
+                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
+                else:
+                    processed.append(u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
+                last_was_heading = True
+            else:
+                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
+                last_was_heading = False
+
+ txt = u'\n'.join(processed)
+ txt = re.sub('[ ]{2,}', ' ', txt)
+
+ return HTML_TEMPLATE % (title, txt)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 47e92a45a9..fd805f8ce8 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -10,7 +10,8 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
- preserve_spaces, detect_paragraph_type, detect_formatting_type
+ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
+ convert_heuristic
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -31,7 +32,7 @@ class TXTInput(InputFormatPlugin):
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
- choices=['auto', 'none', 'markdown'],
+ choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
@@ -96,7 +97,12 @@ class TXTInput(InputFormatPlugin):
txt = separate_paragraphs_print_formatted(txt)
flow_size = getattr(options, 'flow_size', 0)
- html = convert_basic(txt, epub_split_size_kb=flow_size)
+
+ if options.formatting_type == 'heuristic':
+ html = convert_heuristic(txt, epub_split_size_kb=flow_size)
+ else:
+ html = convert_basic(txt, epub_split_size_kb=flow_size)
+
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index f6d628e7c5..79eee79c29 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,6 +9,7 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
@@ -16,7 +17,7 @@ __docformat__ = 'restructuredtext en'
HTML_TEMPLATE = u'<html><head><title>%s</title></head><body>\n%s\n</body></html>'
-def convert_basic(txt, title='', epub_split_size_kb=0):
+def clean_txt(txt):
if isbytestring(txt):
txt = txt.decode('utf-8', 'replace')
# Strip whitespace from the beginning and end of the line. Also replace
@@ -35,6 +36,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
txt = illegal_chars.sub('', txt)
+
+ return txt
+
+def split_txt(txt, epub_split_size_kb=0):
#Takes care if there is no point to split
if epub_split_size_kb > 0:
if isinstance(txt, unicode):
@@ -49,6 +54,12 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt):
txt = txt.decode('utf-8')
+ return txt
+
+def convert_basic(txt, title='', epub_split_size_kb=0):
+ txt = clean_txt(txt)
+ txt = split_txt(txt, epub_split_size_kb)
+
lines = []
# Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'):
@@ -57,6 +68,10 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
return HTML_TEMPLATE % (title, u'\n'.join(lines))
+def convert_heuristic(txt, title='', epub_split_size_kb=0):
+ tp = TXTHeuristicProcessor()
+ return tp.convert(txt, title, epub_split_size_kb)
+
def convert_markdown(txt, title='', disable_toc=False):
md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'],
@@ -111,12 +126,12 @@ def detect_paragraph_type(txt):
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
- if tab_line_count / float(txt_line_count) >= .25:
+ if tab_line_count / float(txt_line_count) >= .15:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
- if empty_line_count / float(txt_line_count) >= .25:
+ if empty_line_count / float(txt_line_count) >= .15:
return 'block'
# Nothing else matched to assume single.
@@ -143,4 +158,4 @@ def detect_formatting_type(txt):
if txt.count('\\'+c) > 10:
return 'markdown'
- return 'none'
+ return 'heuristic'
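Outside the conversion pipeline, the italicizing pass of the heuristic processor amounts to wrapping common plain-text emphasis markers in <i> tags via the named 'words' group. A trimmed-down sketch using only two of the patterns above (the sample sentence is made up):

    import re

    ITALICIZE_STYLE_PATS = [
        r'(?msu)_(?P<words>.+?)_',
        r'(?msu)\*(?P<words>.+?)\*',
    ]

    def italicize(paragraph):
        # wrap each marked-up run in <i>...</i>, dropping the marker characters
        for pat in ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph

    print(italicize('This is *important* and _emphasised_ text.'))
    # This is <i>important</i> and <i>emphasised</i> text.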
From f593b2163154bcd61e21b0e06f8cf0e29514af86 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 8 Jan 2011 13:53:32 -0500
Subject: [PATCH 13/44] TXT Input: Tweak Heuristic italicizing.
---
src/calibre/ebooks/txt/heuristicprocessor.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index cbfa33a96a..b0bbd49961 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -21,15 +21,15 @@ class TXTHeuristicProcessor(object):
]
self.ITALICIZE_STYLE_PATS = [
             r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>.+?)/',
+            r'(?msu)/(?P<words>[^<>]+?)/',
             r'(?msu)~~(?P<words>.+?)~~',
             r'(?msu)\*(?P<words>.+?)\*',
             r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>.+?)/_',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
             r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>.+?)/\*',
-            r'(?msu)_\*/(?P<words>.+?)/\*_',
-            r'(?msu)/:(?P<words>.+?):/',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
             r'(?msu)\|:(?P<words>.+?):\|',
]
@@ -84,5 +84,6 @@ class TXTHeuristicProcessor(object):
txt = u'\n'.join(processed)
txt = re.sub('[ ]{2,}', ' ', txt)
+ print txt
return HTML_TEMPLATE % (title, txt)
From c8f18ff02e32f56220f83872f4def00cca58e73d Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 8 Jan 2011 15:49:10 -0500
Subject: [PATCH 14/44] TXT Input: Heuristic processor, use PreProcessor to
mark chapter headings.
---
src/calibre/ebooks/txt/heuristicprocessor.py | 43 ++++----------------
src/calibre/ebooks/txt/processor.py | 3 --
2 files changed, 7 insertions(+), 39 deletions(-)
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index b0bbd49961..c4489badc5 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -33,30 +33,6 @@ class TXTHeuristicProcessor(object):
r'(?msu)\|:(?P.+?):\|',
]
- def del_maketrans(self, deletechars):
- return dict([(ord(x), u'') for x in deletechars])
-
- def is_heading(self, line):
- if not line:
- return False
- if len(line) > 40:
- return False
-
- line = Unidecoder().decode(line)
-
- # punctuation.
- if line.translate(self.del_maketrans(string.letters + string.digits + ' :-')):
- return False
-
- # All upper case.
- #if line.isupper():
- # return True
- # Roman numerals.
- #if not line.translate(self.del_maketrans('IVXYCivxyc ')):
- # return True
-
- return True
-
def process_paragraph(self, paragraph):
for word in self.ITALICIZE_WORDS:
            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
@@ -70,20 +46,15 @@ class TXTHeuristicProcessor(object):
txt = split_txt(txt, epub_split_size_kb)
processed = []
- last_was_heading = False
for line in txt.split('\n\n'):
- if self.is_heading(line):
- if not last_was_heading:
-                    processed.append(u'<h1>%s</h1>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                else:
-                    processed.append(u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
-                last_was_heading = True
-            else:
-                processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-                last_was_heading = False
+            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
txt = u'\n'.join(processed)
txt = re.sub('[ ]{2,}', ' ', txt)
- print txt
+ html = HTML_TEMPLATE % (title, txt)
+
+ from calibre.ebooks.conversion.utils import PreProcessor
+ pp = PreProcessor()
+ html = pp.markup_chapters(html, pp.get_word_count(html), False)
- return HTML_TEMPLATE % (title, txt)
+ return html
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 1e67caccc6..9dc29e45dd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -9,11 +9,8 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
-<<<<<<< TREE
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
-=======
from calibre.ebooks.conversion.preprocess import DocAnalysis
->>>>>>> MERGE-SOURCE
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember '
From bd14205637cbf71fe4aad655de50f4f0fea98a60 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 8 Jan 2011 15:53:51 -0500
Subject: [PATCH 15/44] ...
---
src/calibre/ebooks/txt/heuristicprocessor.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
index c4489badc5..c4c6a56123 100644
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ b/src/calibre/ebooks/txt/heuristicprocessor.py
@@ -5,7 +5,6 @@ __copyright__ = '2011, John Schember '
__docformat__ = 'restructuredtext en'
import re
-import string
from calibre import prepare_string_for_xml
from calibre.ebooks.unidecode.unidecoder import Unidecoder
@@ -48,7 +47,7 @@ class TXTHeuristicProcessor(object):
processed = []
for line in txt.split('\n\n'):
            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-
+
txt = u'\n'.join(processed)
txt = re.sub('[ ]{2,}', ' ', txt)
html = HTML_TEMPLATE % (title, txt)
From 831ee1fc81b50d9ccd7c771161db322715fa3092 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 8 Jan 2011 16:53:54 -0500
Subject: [PATCH 16/44] TXT Input: Add documentation for the heuristic
formatting option to the option help.
---
src/calibre/ebooks/txt/input.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5060e124ff..c8ce389574 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -37,6 +37,8 @@ class TXTInput(InputFormatPlugin):
help=_('Formatting used within the document.'
'* auto: Try to auto detect the document formatting.\n'
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
+ '* heuristic: Try to detect formatting for elements such as chapter headings '
+ 'and style the elements appropriately.\n'
'* markdown: Run the input though the markdown pre-processor. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
From 12cbaa2304db610ccf101bbd4abe13ff58f68fee Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 8 Jan 2011 17:26:32 -0500
Subject: [PATCH 17/44] TXT Input: Make formatting_type options easier to
understand.
---
src/calibre/ebooks/txt/input.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index c8ce389574..e782cd0cd9 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -35,11 +35,12 @@ class TXTInput(InputFormatPlugin):
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
- '* auto: Try to auto detect the document formatting.\n'
- '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
- '* heuristic: Try to detect formatting for elements such as chapter headings '
- 'and style the elements appropriately.\n'
- '* markdown: Run the input though the markdown pre-processor. '
+ '* auto: Automatically decide which formatting processor to use.\n'
+ '* none: Do not process the document formatting. Everything is a '
+ 'paragraph and no styling is applied.\n'
+ '* heuristic: Process using heuristics to determine formatting such '
+ 'as chapter headings and italic text.\n'
+ '* markdown: Processing using markdown formatting. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
From 8bcdb0fed79c7b0f0b9fbb80d9b3a5b0c683c5d0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 8 Jan 2011 19:17:39 -0700
Subject: [PATCH 18/44] Fix #8007 (Search performance on multiple words)
---
src/calibre/library/caches.py | 28 +++++---
src/calibre/utils/search_query_parser.py | 83 ++++++++++++++++++------
2 files changed, 81 insertions(+), 30 deletions(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 980c9f1fa9..7caeeabda8 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -181,7 +181,7 @@ class ResultCache(SearchQueryParser): # {{{
self.search_restriction = ''
self.field_metadata = field_metadata
self.all_search_locations = field_metadata.get_search_terms()
- SearchQueryParser.__init__(self, self.all_search_locations)
+ SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
self.build_date_relop_dict()
self.build_numeric_relop_dict()
@@ -264,7 +264,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, relop_le]
}
- def get_dates_matches(self, location, query):
+ def get_dates_matches(self, location, query, candidates):
matches = set([])
if len(query) < 2:
return matches
@@ -274,13 +274,15 @@ class ResultCache(SearchQueryParser): # {{{
loc = self.field_metadata[location]['rec_index']
if query == 'false':
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None: continue
if item[loc] is None or item[loc] <= UNDEFINED_DATE:
matches.add(item[0])
return matches
if query == 'true':
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None: continue
if item[loc] is not None and item[loc] > UNDEFINED_DATE:
matches.add(item[0])
@@ -319,7 +321,8 @@ class ResultCache(SearchQueryParser): # {{{
field_count = query.count('-') + 1
else:
field_count = query.count('/') + 1
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None or item[loc] is None: continue
if relop(item[loc], qd, field_count):
matches.add(item[0])
@@ -335,7 +338,7 @@ class ResultCache(SearchQueryParser): # {{{
'<=':[2, lambda r, q: r <= q]
}
- def get_numeric_matches(self, location, query, val_func = None):
+ def get_numeric_matches(self, location, query, candidates, val_func = None):
matches = set([])
if len(query) == 0:
return matches
@@ -381,7 +384,8 @@ class ResultCache(SearchQueryParser): # {{{
except:
return matches
- for item in self._data:
+ for id_ in candidates:
+ item = self._data[id_]
if item is None:
continue
v = val_func(item)
@@ -393,8 +397,13 @@ class ResultCache(SearchQueryParser): # {{{
matches.add(item[0])
return matches
- def get_matches(self, location, query, allow_recursion=True):
+ def get_matches(self, location, query, allow_recursion=True, candidates=None):
matches = set([])
+ if candidates is None:
+ candidates = self.universal_set()
+ if len(candidates) == 0:
+ return matches
+
if query and query.strip():
# get metadata key associated with the search term. Eliminates
# dealing with plurals and other aliases
@@ -476,7 +485,8 @@ class ResultCache(SearchQueryParser): # {{{
else:
q = query
- for item in self._data:
+ for id_ in candidates:
+            item = self._data[id_]
if item is None: continue
if col_datatype[loc] == 'bool': # complexity caused by the two-/three-value tweak
diff --git a/src/calibre/utils/search_query_parser.py b/src/calibre/utils/search_query_parser.py
index db7c7bde5f..447ff8cd14 100644
--- a/src/calibre/utils/search_query_parser.py
+++ b/src/calibre/utils/search_query_parser.py
@@ -118,8 +118,9 @@ class SearchQueryParser(object):
failed.append(test[0])
return failed
- def __init__(self, locations, test=False):
+ def __init__(self, locations, test=False, optimize=False):
self._tests_failed = False
+ self.optimize = optimize
# Define a token
standard_locations = map(lambda x : CaselessLiteral(x)+Suppress(':'),
locations)
@@ -182,38 +183,52 @@ class SearchQueryParser(object):
# empty the list of searches used for recursion testing
self.recurse_level = 0
self.searches_seen = set([])
- return self._parse(query)
+ candidates = self.universal_set()
+ return self._parse(query, candidates)
# this parse is used internally because it doesn't clear the
# recursive search test list. However, we permit seeing the
# same search a few times because the search might appear within
# another search.
- def _parse(self, query):
+ def _parse(self, query, candidates=None):
self.recurse_level += 1
res = self._parser.parseString(query)[0]
- t = self.evaluate(res)
+ if candidates is None:
+ candidates = self.universal_set()
+ t = self.evaluate(res, candidates)
self.recurse_level -= 1
return t
def method(self, group_name):
return getattr(self, 'evaluate_'+group_name)
- def evaluate(self, parse_result):
- return self.method(parse_result.getName())(parse_result)
+ def evaluate(self, parse_result, candidates):
+ return self.method(parse_result.getName())(parse_result, candidates)
- def evaluate_and(self, argument):
- return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+ def evaluate_and(self, argument, candidates):
+ # RHS checks only those items matched by LHS
+ # returns result of RHS check: RHmatches(LHmatches(c))
+ # return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+ l = self.evaluate(argument[0], candidates)
+ return l.intersection(self.evaluate(argument[1], l))
- def evaluate_or(self, argument):
- return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+ def evaluate_or(self, argument, candidates):
+ # RHS checks only those elements not matched by LHS
+ # returns LHS union RHS: LHmatches(c) + RHmatches(c-LHmatches(c))
+ # return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+ l = self.evaluate(argument[0], candidates)
+ return l.union(self.evaluate(argument[1], candidates.difference(l)))
- def evaluate_not(self, argument):
- return self.universal_set().difference(self.evaluate(argument[0]))
+ def evaluate_not(self, argument, candidates):
+ # unary op checks only candidates. Result: list of items matching
+ # returns: c - matches(c)
+ # return self.universal_set().difference(self.evaluate(argument[0]))
+ return candidates.difference(self.evaluate(argument[0], candidates))
- def evaluate_parenthesis(self, argument):
- return self.evaluate(argument[0])
+ def evaluate_parenthesis(self, argument, candidates):
+ return self.evaluate(argument[0], candidates)
- def evaluate_token(self, argument):
+ def evaluate_token(self, argument, candidates):
location = argument[0]
query = argument[1]
if location.lower() == 'search':
@@ -224,17 +239,27 @@ class SearchQueryParser(object):
raise ParseException(query, len(query), 'undefined saved search', self)
if self.recurse_level > 5:
self.searches_seen.add(query)
- return self._parse(saved_searches().lookup(query))
+ return self._parse(saved_searches().lookup(query), candidates)
except: # convert all exceptions (e.g., missing key) to a parse error
raise ParseException(query, len(query), 'undefined saved search', self)
- return self.get_matches(location, query)
+ return self._get_matches(location, query, candidates)
- def get_matches(self, location, query):
+ def _get_matches(self, location, query, candidates):
+ if self.optimize:
+ return self.get_matches(location, query, candidates=candidates)
+ else:
+ return self.get_matches(location, query)
+
+ def get_matches(self, location, query, candidates=None):
'''
Should return the set of matches for :param:'location` and :param:`query`.
+ The search must be performed over all entries if :param:`candidates` is
+ None, otherwise only over the items in candidates.
+
:param:`location` is one of the items in :member:`SearchQueryParser.DEFAULT_LOCATIONS`.
:param:`query` is a string literal.
+ :param:`candidates` is None or a subset of the set returned by :meth:`universal_set`.
'''
return set([])
@@ -561,7 +586,7 @@ class Tester(SearchQueryParser):
def universal_set(self):
return self._universal_set
- def get_matches(self, location, query):
+ def get_matches(self, location, query, candidates=None):
location = location.lower()
if location in self.fields.keys():
getter = operator.itemgetter(self.fields[location])
@@ -573,8 +598,13 @@ class Tester(SearchQueryParser):
if not query:
return set([])
query = query.lower()
- return set(key for key, val in self.texts.items() \
- if query and query in getattr(getter(val), 'lower', lambda : '')())
+ if candidates:
+ return set(key for key, val in self.texts.items() \
+ if key in candidates and query and query
+ in getattr(getter(val), 'lower', lambda : '')())
+ else:
+ return set(key for key, val in self.texts.items() \
+ if query and query in getattr(getter(val), 'lower', lambda : '')())
@@ -592,6 +622,7 @@ class Tester(SearchQueryParser):
def main(args=sys.argv):
+ print 'testing unoptimized'
tester = Tester(['authors', 'author', 'series', 'formats', 'format',
'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
@@ -601,6 +632,16 @@ def main(args=sys.argv):
print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
return 1
+ print '\n\ntesting optimized'
+ tester = Tester(['authors', 'author', 'series', 'formats', 'format',
+ 'publisher', 'rating', 'tags', 'tag', 'comments', 'comment', 'cover',
+ 'isbn', 'ondevice', 'pubdate', 'size', 'date', 'title', u'#read',
+ 'all', 'search'], test=True, optimize=True)
+ failed = tester.run_tests()
+ if tester._tests_failed or failed:
+ print '>>>>>>>>>>>>>> Tests Failed <<<<<<<<<<<<<<<'
+ return 1
+
return 0
if __name__ == '__main__':
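
The candidate-narrowing scheme added above (AND re-checks only the items the left-hand side matched, OR re-checks only the items it did not match) can be summarised with a small standalone sketch. This is illustrative only, not calibre code; the BOOKS data and the match() helper are invented.

    # Illustrative sketch of candidate narrowing; data and helper are invented.
    BOOKS = {
        1: {'title': 'war and peace', 'tags': 'classic'},
        2: {'title': 'peace talks',   'tags': 'fantasy'},
        3: {'title': 'war stories',   'tags': 'classic'},
    }

    def match(location, query, candidates):
        # Scan only the candidate ids, mirroring get_matches(..., candidates=...)
        return set(i for i in candidates if query in BOOKS[i][location])

    def evaluate_and(lhs, rhs, candidates):
        l = match(lhs[0], lhs[1], candidates)
        # RHS only has to look at what the LHS already matched
        return l.intersection(match(rhs[0], rhs[1], l))

    def evaluate_or(lhs, rhs, candidates):
        l = match(lhs[0], lhs[1], candidates)
        # RHS only has to look at what the LHS did not match
        return l.union(match(rhs[0], rhs[1], candidates.difference(l)))

    universe = set(BOOKS)
    print(evaluate_and(('title', 'war'), ('tags', 'classic'), universe))   # ids 1 and 3
    print(evaluate_or(('title', 'peace'), ('tags', 'classic'), universe))  # ids 1, 2 and 3
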
From 5b8ea643214d2db4665614bc1046f8aa3db5c9cb Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 8 Jan 2011 19:19:28 -0700
Subject: [PATCH 19/44] And another droid
---
src/calibre/devices/android/driver.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py
index b7e2f0fd2e..73c930778e 100644
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@@ -29,7 +29,7 @@ class ANDROID(USBMS):
# Motorola
0x22b8 : { 0x41d9 : [0x216], 0x2d61 : [0x100], 0x2d67 : [0x100],
0x41db : [0x216], 0x4285 : [0x216], 0x42a3 : [0x216],
- 0x4286 : [0x216] },
+ 0x4286 : [0x216], 0x42b3 : [0x216] },
# Sony Ericsson
0xfce : { 0xd12e : [0x0100]},
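
The dictionary entry added above follows the driver's vendor table layout: USB vendor id, then product id, then a list of acceptable BCD revisions. A minimal standalone illustration of how such a table can be consulted; the function and table names below are invented for the example.

    VENDOR_PRODUCTS = {
        0x22b8: {0x4286: [0x216], 0x42b3: [0x216]},  # Motorola entries from the patch
    }

    def is_supported(vendor, product, bcd):
        # vendor -> product -> list of acceptable BCD revisions
        return bcd in VENDOR_PRODUCTS.get(vendor, {}).get(product, [])

    print(is_supported(0x22b8, 0x42b3, 0x216))  # True, the newly added droid
    print(is_supported(0x22b8, 0x9999, 0x216))  # False
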
From e9130241603a99f7e8dddfb8ff7df6edf4faacb5 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 10:40:30 +0800
Subject: [PATCH 20/44] ...
---
src/calibre/ebooks/txt/input.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index e2405de617..34a702cc55 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -23,7 +23,7 @@ class TXTInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto',
- choices=['auto', 'block', 'single', 'print'],
+ choices=['auto', 'block', 'single', 'print', 'unformatted'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n'
From 289cdf33925dc4f80c08889e941becc9c3862471 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 10:43:24 +0800
Subject: [PATCH 21/44] changed unformatted description
---
src/calibre/ebooks/txt/input.py | 2 +-
src/calibre/ebooks/txt/processor.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 34a702cc55..9bc9323a4c 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -31,7 +31,7 @@ class TXTInput(InputFormatPlugin):
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.'
- '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
+ '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 9dc29e45dd..e26f0a9d07 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -117,7 +117,7 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
- unformatted: most lines have hard line breaks, few/no spaces or indents
+ unformatted: most lines have hard line breaks, few/no blank lines or indents
returns block, single, print, unformatted
'''
From f5a6195ceb697e6638bb7460ab9b2f1949a5342b Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sun, 9 Jan 2011 00:02:24 -0500
Subject: [PATCH 22/44] TXT Output: Clean up and produce consistent output.
Spacing around headings. Headings are not indented when using the remove
paragraph spacing option.
---
src/calibre/ebooks/txt/txtml.py | 73 +++++++++++++++++++++++++--------
1 file changed, 57 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index a3b4ed7afe..786f50824d 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into plain text
'''
-import os
import re
from lxml import etree
@@ -33,6 +32,15 @@ BLOCK_STYLES = [
'block',
]
+HEADING_TAGS = [
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+]
+
SPACE_TAGS = [
'td',
'br',
@@ -47,6 +55,10 @@ class TXTMLizer(object):
self.log.info('Converting XHTML to TXT...')
self.oeb_book = oeb_book
self.opts = opts
+ self.toc_ids = []
+ self.last_was_heading = False
+
+ self.create_flat_toc(self.oeb_book.toc)
return self.mlize_spine()
@@ -58,8 +70,11 @@ class TXTMLizer(object):
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content)
- output += self.dump_text(etree.fromstring(content), stylizer)
- output = self.cleanup_text(u''.join(output))
+ output += self.dump_text(etree.fromstring(content), stylizer, item)
+ output += '\n\n\n\n\n\n'
+ output = u''.join(output)
+ output = u'\n'.join(l.rstrip() for l in output.splitlines())
+ output = self.cleanup_text(output)
return output
@@ -68,6 +83,8 @@ class TXTMLizer(object):
text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
+ # Condense redundant spaces created by replacing newlines with spaces.
+ text = re.sub(r'[ ]{2,}', ' ', text)
return text
@@ -80,6 +97,14 @@ class TXTMLizer(object):
toc.append(u'* %s\n\n' % item.title)
return ''.join(toc)
+ def create_flat_toc(self, nodes):
+ '''
+ Turns a hierarchical list of TOC href's into a flat list.
+ '''
+ for item in nodes:
+ self.toc_ids.append(item.href)
+ self.create_flat_toc(item.nodes)
+
def cleanup_text(self, text):
self.log.debug('\tClean up text...')
# Replace bad characters.
@@ -92,7 +117,7 @@ class TXTMLizer(object):
text = text.replace('\f+', ' ')
# Single line paragraph.
- text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
+ text = re.sub('(?<=.)\n(?=.)', ' ', text)
# Remove multiple spaces.
text = re.sub('[ ]{2,}', ' ', text)
@@ -101,13 +126,19 @@ class TXTMLizer(object):
text = re.sub('\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text)
- text = re.sub('(?imu)^(?=.)', '\t', text)
+ text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
+ text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
else:
- text = re.sub('\n{3,}', '\n\n', text)
+ text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
# Replace spaces at the beginning and end of lines
+ # We don't replace tabs because those are only added
+ # when remove paragraph spacing is enabled.
text = re.sub('(?imu)^[ ]+', '', text)
text = re.sub('(?imu)[ ]+$', '', text)
+
+ # Remove empty space and newlines at the beginning of the document.
+ text = re.sub(r'(?u)^[ \n]+', '', text)
if self.opts.max_line_length:
max_length = self.opts.max_line_length
@@ -145,13 +176,11 @@ class TXTMLizer(object):
return text
- def dump_text(self, elem, stylizer, end=''):
+ def dump_text(self, elem, stylizer, page):
'''
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
- @end: The last two characters of the text from the previous element.
- This is used to determine if a blank line is needed when starting
- a new block element.
+ @page: OEB page used to determine absolute urls.
'''
if not isinstance(elem.tag, basestring) \
@@ -170,13 +199,22 @@ class TXTMLizer(object):
return ['']
tag = barename(elem.tag)
+ tag_id = elem.attrib.get('id', None)
in_block = False
+ in_heading = False
+
+ # Are we in a heading?
+ # This can either be a heading tag or a TOC item.
+ if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
+ in_heading = True
+ if not self.last_was_heading:
+ text.append('\n\n\n\n\n\n')
# Are we in a paragraph block?
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
+ if self.opts.remove_paragraph_spacing and not in_heading:
+ text.append(u'\t')
in_block = True
- if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
- text.append(u'\n\n')
if tag in SPACE_TAGS:
text.append(u' ')
@@ -185,14 +223,17 @@ class TXTMLizer(object):
if hasattr(elem, 'text') and elem.text:
text.append(elem.text)
+ # Recurse down into tags within the tag we are in.
for item in elem:
- en = u''
- if len(text) >= 2:
- en = text[-1][-2:]
- text += self.dump_text(item, stylizer, en)
+ text += self.dump_text(item, stylizer, page)
if in_block:
text.append(u'\n\n')
+ if in_heading:
+ text.append(u'\n')
+ self.last_was_heading = True
+ else:
+ self.last_was_heading = False
if hasattr(elem, 'tail') and elem.tail:
text.append(elem.tail)
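
A quick standalone illustration of the flat TOC used above to recognise headings: create_flat_toc() just collects every href from the nested TOC so that '%s#%s' % (page.href, tag_id) can be looked up later. The Node class below is a stand-in invented for the example, not calibre's TOC type.

    class Node(object):
        # Minimal stand-in for a TOC node: an href plus child nodes.
        def __init__(self, href, nodes=()):
            self.href = href
            self.nodes = list(nodes)

    def create_flat_toc(nodes, toc_ids):
        # Depth-first walk, same shape as the method added in the patch.
        for item in nodes:
            toc_ids.append(item.href)
            create_flat_toc(item.nodes, toc_ids)

    toc = [Node('ch1.html#start', [Node('ch1.html#s1'), Node('ch1.html#s2')]),
           Node('ch2.html#start')]
    toc_ids = []
    create_flat_toc(toc, toc_ids)
    print(toc_ids)  # ['ch1.html#start', 'ch1.html#s1', 'ch1.html#s2', 'ch2.html#start']
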
From f3a9f3f83f7da4821bdc1fca2ba0df66aca714e1 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 17:27:24 +0800
Subject: [PATCH 23/44] added dehyphenation to txt input
---
src/calibre/ebooks/conversion/preprocess.py | 15 +++++++++++----
src/calibre/ebooks/txt/input.py | 18 ++++++++++++------
2 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ae111355e4..df9fd66407 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,6 +72,8 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
+ raw = raw.replace('\r\n', '\n')
+ raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=]*>\s*
).*?(?=
)', re.DOTALL)
elif format == 'pdf':
@@ -79,7 +81,7 @@ class DocAnalysis(object):
elif format == 'spanned_html':
linere = re.compile('(?<=)', re.DOTALL)
elif format == 'txt':
- linere = re.compile('.*?\n', re.DOTALL)
+ linere = re.compile('.*?\n')
self.lines = linere.findall(raw)
def line_length(self, percent):
@@ -177,7 +179,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -194,7 +196,7 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
- #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
@@ -225,8 +227,13 @@ class Dehyphenator(object):
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*([iubp]>\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
+ elif format == 'txt':
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet
+ elif format == 'individual_words_txt':
+ intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
+
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 9bc9323a4c..f6adb617c3 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
@@ -91,8 +92,16 @@ class TXTInput(InputFormatPlugin):
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
- log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
-
+ log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+ # Get length for hyphen removal and punctuation unwrap
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
+
+ # Dehyphenate
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(txt,'txt', length)
+
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
@@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor
- from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length
- docanalysis = DocAnalysis('txt', txt)
- length = docanalysis.line_length(.5)
+
# unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@@ -117,7 +124,6 @@ class TXTInput(InputFormatPlugin):
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else:
html = convert_basic(txt, epub_split_size_kb=flow_size)
-
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
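
The idea behind the dehyphenation step wired in above can be shown with a rough standalone sketch: rejoin a word split by a hyphen at a line break only when the joined form also occurs elsewhere in the text. This is a simplification for illustration, not calibre's Dehyphenator implementation, and the sample string is invented.

    import re

    def dehyphenate_txt(txt):
        pattern = re.compile(r'(\w+)-\n\s*(\w+)')
        def repl(m):
            joined = m.group(1) + m.group(2)
            # Only join if the whole word is seen elsewhere in the text;
            # otherwise keep the hyphen, since it may be a real compound.
            if joined.lower() in txt.lower():
                return joined
            return m.group(1) + '-' + m.group(2)
        return pattern.sub(repl, txt)

    sample = "a water-\nproof cover, a waterproof hat, and a first-\nrate story"
    print(dehyphenate_txt(sample))
    # -> a waterproof cover, a waterproof hat, and a first-rate story
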
From 696d9252324a5fa31ae91f8a3c5d472b5d5d953c Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 18:14:49 +0800
Subject: [PATCH 24/44] normalized line endings to simplify line length detection and
 dehyphenation; fixes print formatted output for certain line endings
---
src/calibre/ebooks/conversion/preprocess.py | 10 +++++-----
src/calibre/ebooks/txt/input.py | 8 ++++++--
src/calibre/ebooks/txt/processor.py | 5 ++++-
3 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index df9fd66407..d9d735e391 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,8 +72,8 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
- raw = raw.replace('\r\n', '\n')
- raw = raw.replace('\r', '\n')
+ #raw = raw.replace('\r\n', '\n')
+ #raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=]*>\s*
).*?(?=
)', re.DOTALL)
elif format == 'pdf':
@@ -214,10 +214,10 @@ class Dehyphenator(object):
else:
if self.html.find(lookupword) != -1 or searchresult != -1:
- #print "returned dehyphenated word: " + str(dehyphenated)
+ print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
- #print " returned hyphenated word: " + str(hyphenated)
+ print " returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
@@ -228,7 +228,7 @@ class Dehyphenator(object):
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
elif format == 'txt':
- intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'individual_words_txt':
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index f6adb617c3..2e35e8e345 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
- convert_heuristic
+ convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
+ print "length is "+str(length)
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(txt,'txt', length)
+ txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block.
# single and print are transformed to block for processing.
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e26f0a9d07..ebdadebda2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
+ return txt
+
+def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt
From 0f109d699f06967394370150a0a35bf671a283c6 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 18:38:52 +0800
Subject: [PATCH 25/44] tweaked the auto-detection to handle cases where the
vast majority of the lines are formatted as block or print
---
src/calibre/ebooks/txt/processor.py | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index ebdadebda2..6a1a106681 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -133,15 +133,21 @@ def detect_paragraph_type(txt):
hardbreaks = docanalysis.line_histogram(.55)
if hardbreaks:
- # Check for print
+ # Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
- if tab_line_count / float(txt_line_count) >= .15:
- return 'print'
-
- # Check for block
+ print_percent = tab_line_count / float(txt_line_count)
+
+ # Determine block percentage
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
- if empty_line_count / float(txt_line_count) >= .15:
- return 'block'
+ block_percent = empty_line_count / float(txt_line_count)
+
+ # Compare the two types - the type with the larger number of instances wins.
+ # If either type accounts for the vast majority of the document, neither wins.
+ if print_percent >= block_percent:
+ if .15 <= print_percent <= .75:
+ return 'print'
+ elif .15 <= block_percent <= .75:
+ return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
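
The revised comparison above can be read as: whichever style is more common (print-indented lines vs. blank-line-separated blocks) wins, but only while it stays between 15% and 75% of the lines; outside that band the text falls through to 'unformatted'. A standalone sketch of just that decision, with the thresholds taken from the patch and an invented sample:

    import re

    def compare_paragraph_styles(txt):
        txt_line_count = max(len(txt.splitlines()), 1)
        tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
        empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
        print_percent = tab_line_count / float(txt_line_count)
        block_percent = empty_line_count / float(txt_line_count)
        if print_percent >= block_percent:
            if .15 <= print_percent <= .75:
                return 'print'
        elif .15 <= block_percent <= .75:
            return 'block'
        return 'unformatted'

    sample = "line one of para one\nline two of para one\n\nline one of para two\nline two of para two\n"
    print(compare_paragraph_styles(sample))  # 'block'
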
From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 19:34:02 +0800
Subject: [PATCH 26/44] added partial dehyphenation for markdown
---
src/calibre/ebooks/conversion/preprocess.py | 16 +++++++--------
src/calibre/ebooks/txt/input.py | 22 +++++++++++++--------
2 files changed, 22 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d9d735e391..e2c51846a4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -201,15 +201,15 @@ class Dehyphenator(object):
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
- if self.format == 'html_cleanup':
+ if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
- #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
- #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
- #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else:
@@ -230,12 +230,12 @@ class Dehyphenator(object):
elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet
- elif format == 'individual_words_txt':
- intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
-
+ intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
+ elif format == 'txt_cleanup':
+ intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+
html = intextmatch.sub(self.dehyphenate, html)
return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 2e35e8e345..5fbdc7131a 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin):
# followed by the entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
+
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
+ # Get length for hyphen removal and punctuation unwrap
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
+ print "length is "+str(length)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
@@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
- # Normalize line endings
- txt = normalize_line_endings(txt)
-
- # Get length for hyphen removal and punctuation unwrap
- docanalysis = DocAnalysis('txt', txt)
- length = docanalysis.line_length(.5)
- print "length is "+str(length)
-
# Dehyphenate
dehyphenator = Dehyphenator()
txt = dehyphenator(txt,'txt', length)
@@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
else:
html = convert_basic(txt, epub_split_size_kb=flow_size)
+ # Dehyphenate in cleanup mode for missed txt and markdown conversion
+ print "going through final dehyphenation"
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'txt_cleanup', length)
+ html = dehyphenator(html,'html_cleanup', length)
+
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
From 9751f99db95185a9a6cdf66029f1d46e4a9d90d8 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 19:57:15 +0800
Subject: [PATCH 27/44] cleaned up print statements
---
src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------
src/calibre/ebooks/txt/input.py | 2 --
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e2c51846a4..32eee713fe 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -196,28 +196,28 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
- print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
- print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
- print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ #print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
- print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else:
if self.html.find(lookupword) != -1 or searchresult != -1:
- print "returned dehyphenated word: " + str(dehyphenated)
+ #print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
- print " returned hyphenated word: " + str(hyphenated)
+ #print " returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5fbdc7131a..3957391494 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -80,7 +80,6 @@ class TXTInput(InputFormatPlugin):
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
- print "length is "+str(length)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
@@ -130,7 +129,6 @@ class TXTInput(InputFormatPlugin):
html = convert_basic(txt, epub_split_size_kb=flow_size)
# Dehyphenate in cleanup mode for missed txt and markdown conversion
- print "going through final dehyphenation"
dehyphenator = Dehyphenator()
html = dehyphenator(html,'txt_cleanup', length)
html = dehyphenator(html,'html_cleanup', length)
From 7008e9b64cbe98ca43e77965a84a3f5af4e88f6d Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 21:56:12 +0800
Subject: [PATCH 28/44] ...
---
src/calibre/ebooks/conversion/preprocess.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 32eee713fe..08a46cb8d9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,8 +72,6 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
- #raw = raw.replace('\r\n', '\n')
- #raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=]*>\s*
).*?(?=
)', re.DOTALL)
elif format == 'pdf':
From 6871651ff1719971c1f52b4fb8ed6c2ae2025c44 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 9 Jan 2011 16:41:54 +0000
Subject: [PATCH 29/44] Fix bug in formatter where parse errors at end of file
threw an exception instead of providing the message.
---
src/calibre/utils/formatter.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py
index 2e4f843c3d..40760bf91b 100644
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@@ -98,9 +98,10 @@ class _Parser(object):
m = 'Formatter: ' + message + _(' near ')
if self.lex_pos > 0:
m = '{0} {1}'.format(m, self.prog[self.lex_pos-1][1])
- m = '{0} {1}'.format(m, self.prog[self.lex_pos][1])
- if self.lex_pos < len(self.prog):
+ elif self.lex_pos < len(self.prog):
m = '{0} {1}'.format(m, self.prog[self.lex_pos+1][1])
+ else:
+ m = '{0} {1}'.format(m, _('end of program'))
raise ValueError(m)
def token(self):
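
The corrected branching above boils down to: report the token just before the failure point when there is one, a later token when the failure is at the very start, and the literal 'end of program' otherwise. A toy standalone version of that selection; the token stream below is made up.

    def near_text(prog, lex_pos):
        # prog is a list of (token_type, token_text) pairs.
        if lex_pos > 0:
            return prog[lex_pos - 1][1]
        elif lex_pos < len(prog):
            return prog[lex_pos + 1][1]
        return 'end of program'

    prog = [('id', 'field'), ('comma', ','), ('id', 'fmt')]
    print(near_text(prog, 2))  # ','  the token before the error position
    print(near_text([], 0))    # 'end of program'  the case the patch now handles
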
From 0067f6af4edf0a5645acc6a9a48a1608f60fb5a6 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 9 Jan 2011 16:42:11 +0000
Subject: [PATCH 30/44] Fix bug in formatter where parse errors at end of file
threw an exception instead of providing the message.
---
src/calibre/gui2/preferences/plugboard.py | 31 +++++++++++++++++++----
1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py
index 296387106c..e1dc6b03bd 100644
--- a/src/calibre/gui2/preferences/plugboard.py
+++ b/src/calibre/gui2/preferences/plugboard.py
@@ -5,11 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-from PyQt4 import QtGui
-from PyQt4.Qt import Qt
+from PyQt4.Qt import Qt, QLineEdit, QComboBox, SIGNAL, QListWidgetItem
from calibre.gui2 import error_dialog
from calibre.gui2.device import device_name_for_plugboards
+from calibre.gui2.dialogs.template_dialog import TemplateDialog
from calibre.gui2.preferences import ConfigWidgetBase, test_widget
from calibre.gui2.preferences.plugboard_ui import Ui_Form
from calibre.customize.ui import metadata_writers, device_plugins
@@ -17,6 +17,27 @@ from calibre.library.save_to_disk import plugboard_any_format_value, \
plugboard_any_device_value, plugboard_save_to_disk_value
from calibre.utils.formatter import validation_formatter
+
+class LineEditWithTextBox(QLineEdit):
+
+ '''
+ Extend the context menu of a QLineEdit to include more actions.
+ '''
+
+ def contextMenuEvent(self, event):
+ menu = self.createStandardContextMenu()
+ menu.addSeparator()
+
+ action_open_editor = menu.addAction(_('Open Editor'))
+
+ self.connect(action_open_editor, SIGNAL('triggered()'), self.open_editor)
+ menu.exec_(event.globalPos())
+
+ def open_editor(self):
+ t = TemplateDialog(self, self.text())
+ if t.exec_():
+ self.setText(t.textbox.toPlainText())
+
class ConfigWidget(ConfigWidgetBase, Ui_Form):
def genesis(self, gui):
@@ -72,10 +93,10 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
self.source_widgets = []
self.dest_widgets = []
for i in range(0, len(self.dest_fields)-1):
- w = QtGui.QLineEdit(self)
+ w = LineEditWithTextBox(self)
self.source_widgets.append(w)
self.fields_layout.addWidget(w, 5+i, 0, 1, 1)
- w = QtGui.QComboBox(self)
+ w = QComboBox(self)
self.dest_widgets.append(w)
self.fields_layout.addWidget(w, 5+i, 1, 1, 1)
@@ -297,7 +318,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
for op in self.current_plugboards[f][d]:
ops.append('([' + op[0] + '] -> ' + op[1] + ')')
txt = '%s:%s = %s\n'%(f, d, ', '.join(ops))
- item = QtGui.QListWidgetItem(txt)
+ item = QListWidgetItem(txt)
item.setData(Qt.UserRole, (f, d))
self.existing_plugboards.addItem(item)
self.refilling = False
From cdc017bc6349cfb29944e7da4657c2ba42f122fd Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 9 Jan 2011 16:42:28 +0000
Subject: [PATCH 31/44] Fix #8244 Merging two books fails (None custom numeric
values throws exception)
---
src/calibre/library/custom_columns.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py
index d925f7c91d..d905f6d01a 100644
--- a/src/calibre/library/custom_columns.py
+++ b/src/calibre/library/custom_columns.py
@@ -151,6 +151,8 @@ class CustomColumns(object):
return v
def adapt_number(x, d):
+ if x is None:
+ return None
if isinstance(x, (str, unicode, bytes)):
if x.lower() == 'none':
return None
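
The guard added above lets an unset (None) numeric custom column survive a merge instead of raising. A standalone sketch of the adapter's behaviour; the int/float conversion tail is assumed for illustration and may differ from calibre's actual code.

    def adapt_number(x, d):
        # None (unset column) and the string 'none' both mean "no value".
        if x is None:
            return None
        if isinstance(x, str):
            if x.lower() == 'none':
                return None
        # Assumed conversion step, for illustration only.
        return int(x) if d['datatype'] == 'int' else float(x)

    print(adapt_number(None, {'datatype': 'float'}))    # None (no longer raises)
    print(adapt_number('none', {'datatype': 'float'}))  # None
    print(adapt_number('3.5', {'datatype': 'float'}))   # 3.5
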
From 1670cd29bae7b41186141f902e0057676d985967 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 10:32:19 -0700
Subject: [PATCH 32/44] Cicero by mad
---
resources/recipes/cicero.recipe | 35 +++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
create mode 100644 resources/recipes/cicero.recipe
diff --git a/resources/recipes/cicero.recipe b/resources/recipes/cicero.recipe
new file mode 100644
index 0000000000..2df6b68000
--- /dev/null
+++ b/resources/recipes/cicero.recipe
@@ -0,0 +1,35 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Cicero(BasicNewsRecipe):
+ timefmt = ' [%Y-%m-%d]'
+ title = u'Cicero'
+ __author__ = 'mad@sharktooth.de'
+ description = u'Magazin f\xfcr politische Kultur'
+ oldest_article = 7
+ language = 'de'
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ publisher = 'Ringier Publishing'
+ category = 'news, politics, Germany'
+ encoding = 'iso-8859-1'
+ publication_type = 'magazine'
+ masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
+ feeds = [
+(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
+#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
+#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
+#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
+#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
+#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
+#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
+#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
+#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
+#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
+(u'Kolumne - Alle Kolulmnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
+#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
+#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
+]
+
+ def print_version(self, url):
+ return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
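
The print_version() rewrite above keeps only the query portion of the article URL and re-attaches it to the print page. A quick illustration with a made-up article URL:

    url = 'http://www.cicero.de/artikel.php?item=12345'
    print('http://www.cicero.de/page_print.php?' + url.rpartition('?')[2])
    # -> http://www.cicero.de/page_print.php?item=12345
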
From e58ccd8c5e4f4a251c8bf738a621d1a29c6e91da Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 10:55:15 -0700
Subject: [PATCH 33/44] Fix XSS vulnerability in content server. Fixes #7980
(Security vulnerability in Calibre 0.7.34)
---
src/calibre/library/server/browse.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py
index 37799c4cbc..3e4687be95 100644
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@@ -756,7 +756,7 @@ class BrowseServer(object):
sort = self.browse_sort_book_list(items, list_sort)
ids = [x[0] for x in items]
html = render_book_list(ids, self.opts.url_prefix,
- suffix=_('in search')+': '+query)
+ suffix=_('in search')+': '+xml(query))
return self.browse_template(sort, category=False, initial_search=query).format(
title=_('Matching books'),
script='booklist();', main=html)
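
The one-line fix above matters because the search query is echoed back into the rendered page; without escaping, a crafted query can inject markup. A minimal standalone illustration, where xml() is a stand-in for an HTML/XML escaping helper (assumed behaviour, not the server's code):

    def xml(s):
        # Stand-in escaping helper for the illustration.
        return (s.replace('&', '&amp;').replace('<', '&lt;')
                 .replace('>', '&gt;').replace('"', '&quot;'))

    query = '<script>alert("xss")</script>'
    print('in search: ' + query)       # unescaped: markup would reach the browser
    print('in search: ' + xml(query))  # escaped: &lt;script&gt;alert(&quot;xss&quot;)&lt;/script&gt;
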
From 6349979ca5a06d4efaf573902508fd5ac437128f Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 9 Jan 2011 18:15:02 +0000
Subject: [PATCH 34/44] Fix regression on empty ismult custom columns
---
src/calibre/library/custom_columns.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/library/custom_columns.py b/src/calibre/library/custom_columns.py
index d905f6d01a..f94081f046 100644
--- a/src/calibre/library/custom_columns.py
+++ b/src/calibre/library/custom_columns.py
@@ -197,8 +197,8 @@ class CustomColumns(object):
data = self.custom_column_num_map[num]
row = self.data._data[idx] if index_is_id else self.data[idx]
ans = row[self.FIELD_MAP[data['num']]]
- if ans and data['is_multiple'] and data['datatype'] == 'text':
- ans = ans.split('|')
+ if data['is_multiple'] and data['datatype'] == 'text':
+ ans = ans.split('|') if ans else []
if data['display'].get('sort_alpha', False):
ans.sort(cmp=lambda x,y:cmp(x.lower(), y.lower()))
return ans
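
The change above makes an empty multiple-value column come back as an empty list rather than staying None or a raw string, so callers get a list in both cases. A quick illustration of the two branches, with invented values:

    def split_multiple(ans):
        # Mirrors: ans = ans.split('|') if ans else []
        return ans.split('|') if ans else []

    print(split_multiple('tag1|tag2'))  # ['tag1', 'tag2']
    print(split_multiple(None))         # []  (previously this stayed None)
    print(split_multiple(''))           # []
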
From 31c354a164a8816576ce5194a6b0e1b5d64b6728 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 11:15:34 -0700
Subject: [PATCH 35/44] ...
---
setup/build_environment.py | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/setup/build_environment.py b/setup/build_environment.py
index 10ab1b0735..bdfddd2205 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -117,7 +117,6 @@ if iswindows:
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
@@ -131,7 +130,6 @@ elif isosx:
fc_lib = '/sw/lib'
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib')
poppler_libs = ['poppler']
@@ -150,9 +148,6 @@ else:
# Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
- popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
- if not popplerqt4_inc_dirs:
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@@ -187,13 +182,10 @@ if not poppler_inc_dirs or not os.path.exists(
poppler_error = \
('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and',
- ' POPPLER_LIB_DIR environment variables.')
-
-popplerqt4_error = None
-if not popplerqt4_inc_dirs or not os.path.exists(
- os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
- popplerqt4_error = \
- ('Poppler Qt4 bindings not found on your system.')
+ ' POPPLER_LIB_DIR environment variables. calibre requires '
+ ' the poppler XPDF headers. If your distro does not '
+ ' include them you will have to re-compile poppler '
+ ' by hand with --enable-xpdf-headers')
magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
From d63bfeff1158fc9f8ef9f7ba78cd7b39f18c9a98 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 11:18:35 -0700
Subject: [PATCH 36/44] ...
---
setup/build_environment.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup/build_environment.py b/setup/build_environment.py
index bdfddd2205..f0adaf9584 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -192,7 +192,7 @@ if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
'wand')):
magick_error = ('ImageMagick not found on your system. '
'Try setting the environment variables MAGICK_INC '
- 'and MAGICK_LIB to help calibre locate the inclue and libbrary '
+ 'and MAGICK_LIB to help calibre locate the include and library '
'files.')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
From be03e57f2cf8d25b87e888b781ab14cc4ff3b20f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 11:44:43 -0700
Subject: [PATCH 37/44] El Correo by desUBIKado
---
resources/recipes/el_correo.recipe | 122 +++++++++++++++++++++++++++++
1 file changed, 122 insertions(+)
create mode 100644 resources/recipes/el_correo.recipe
diff --git a/resources/recipes/el_correo.recipe b/resources/recipes/el_correo.recipe
new file mode 100644
index 0000000000..9190560b02
--- /dev/null
+++ b/resources/recipes/el_correo.recipe
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '08 January 2011, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Daily newspaper from Biscay'
+__version__ = 'v0.08'
+__date__ = '08, January 2011'
+'''
+[url]http://www.elcorreo.com/[/url]
+'''
+
+import time
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class heraldo(BasicNewsRecipe):
+ __author__ = 'desUBIKado'
+ description = 'Daily newspaper from Biscay'
+ title = u'El Correo'
+ publisher = 'Vocento'
+ category = 'News, politics, culture, economy, general interest'
+ oldest_article = 2
+ delay = 1
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ language = 'es'
+ timefmt = '[%a, %d %b, %Y]'
+ encoding = 'iso-8859-1'
+ remove_empty_feeds = True
+ remove_javascript = False
+
+ feeds = [
+ (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
+ (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
+ (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
+ (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
+ (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
+ (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
+ (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
+ (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
+ (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
+ (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
+ (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
+ ]
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
+ dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
+ ]
+
+ remove_tags = [
+ dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
+ dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
+ dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
+ dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
+ dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
+ dict(name='div', attrs={'id':['articulopina']}),
+ dict(name='br', attrs={'class':'clear'}),
+ dict(name='form', attrs={'name':'frm_conversor2'})
+ ]
+
+ remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
+ remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
+
+ def get_cover_url(self):
+ cover = None
+ st = time.localtime()
+ year = str(st.tm_year)
+ month = "%.2d" % st.tm_mon
+ day = "%.2d" % st.tm_mday
+ #[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url]
+ #[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url]
+ cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
+
+ br = BasicNewsRecipe.get_browser()
+ try:
+ br.open(cover)
+ except:
+ self.log("\nPortada no disponible")
+ cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
+ return cover
+
+ extra_css = '''
+ h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+ h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
+ h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
+ h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
+ img{margin-bottom: 0.4em}
+ '''
+
+
+
+ preprocess_regexps = [
+
+ # To present the image of the embedded video
+ (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '
'),
+ (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '