From d0b195164604c133ba345e2c008ef5d8abe8db56 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 23 Jan 2011 16:01:34 -0500 Subject: [PATCH 01/16] Implement ticket #8504: Save previous filename import patterns and store them in a combo box. --- src/calibre/ebooks/metadata/meta.py | 8 +++++++- src/calibre/gui2/filename_pattern.ui | 16 +++++++++++++--- src/calibre/gui2/widgets.py | 24 ++++++++++++++++-------- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index cbd9db3f04..b204e08bed 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -142,7 +142,13 @@ def metadata_from_filename(name, pat=None): name = name.rpartition('.')[0] mi = MetaInformation(None, None) if pat is None: - pat = re.compile(prefs.get('filename_pattern')) + pat_re = prefs.get('filename_pattern') + if isinstance(pat_re, list): + if pat_re: + pat_re = pat_re[0] + else: + pat_re = '' + pat = re.compile(pat_re) name = name.replace('_', ' ') match = pat.search(name) if match is not None: diff --git a/src/calibre/gui2/filename_pattern.ui b/src/calibre/gui2/filename_pattern.ui index d120ca80b2..e2367c8ceb 100644 --- a/src/calibre/gui2/filename_pattern.ui +++ b/src/calibre/gui2/filename_pattern.ui @@ -43,7 +43,17 @@ p, li { white-space: pre-wrap; } - + + + true + + + 10 + + + QComboBox::InsertAtTop + + @@ -94,8 +104,8 @@ p, li { white-space: pre-wrap; } 0 0 - 301 - 234 + 277 + 276 diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index 9e117822e4..a433a6e5d7 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -67,17 +67,21 @@ class FilenamePattern(QWidget, Ui_Form): self.setupUi(self) self.connect(self.test_button, SIGNAL('clicked()'), self.do_test) - self.connect(self.re, SIGNAL('returnPressed()'), self.do_test) - self.initialize() - self.re.textChanged.connect(lambda x: self.changed_signal.emit()) + self.connect(self.re.lineEdit(), SIGNAL('returnPressed()'), self.do_test) + self.re.lineEdit().textChanged.connect(lambda x: self.changed_signal.emit()) def initialize(self, defaults=False): if defaults: val = prefs.defaults['filename_pattern'] else: val = prefs['filename_pattern'] - self.re.setText(val) - + if isinstance(val, list): + if len(val) > 0: + for v in val: + self.re.addItem(v) + self.re.setCurrentIndex(0) + else: + self.re.lineEdit().setText(val if val else '') def do_test(self): try: @@ -110,12 +114,16 @@ class FilenamePattern(QWidget, Ui_Form): def pattern(self): - pat = unicode(self.re.text()) + pat = unicode(self.re.lineEdit().text()) return re.compile(pat) def commit(self): - pat = self.pattern().pattern - prefs['filename_pattern'] = pat + pat = [] + patterns = [unicode(self.re.lineEdit().text())] + [unicode(self.re.itemText(i)) for i in xrange(self.re.count())] + for p in patterns[:14]: + if p not in pat: + pat.append(p) + prefs['filename_pattern'] = pat return pat From 94fb463ab2af8d0350e4d2652a6f13e5adce55d9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 23 Jan 2011 17:46:41 -0500 Subject: [PATCH 02/16] Heuristics, italicize common cases: Enhance pattern matching to match punctuation after pattern. --- src/calibre/ebooks/conversion/utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index aabb1b8bc4..ad7f5f117d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -137,17 +137,17 @@ class HeuristicProcessor(object): ] ITALICIZE_STYLE_PATS = [ - r'(?msu)(?<=\s)_(?P\S[^_]{0,40}?\S)?_(?=\s)', - r'(?msu)(?<=\s)/(?P\S[^/]{0,40}?\S)?/(?=\s)', - r'(?msu)(?<=\s)~~(?P\S[^~]{0,40}?\S)?~~(?=\s)', - r'(?msu)(?<=\s)\*(?P\S[^\*]{0,40}?\S)?\*(?=\s)', - r'(?msu)(?<=\s)~(?P\S[^~]{0,40}?\S)?~(?=\s)', - r'(?msu)(?<=\s)_/(?P\S[^/_]{0,40}?\S)?/_(?=\s)', - r'(?msu)(?<=\s)_\*(?P\S[^\*_]{0,40}?\S)?\*_(?=\s)', - r'(?msu)(?<=\s)\*/(?P\S[^/\*]{0,40}?\S)?/\*(?=\s)', - r'(?msu)(?<=\s)_\*/(?P\S[^\*_]{0,40}?\S)?/\*_(?=\s)', - r'(?msu)(?<=\s)/:(?P\S[^:/]{0,40}?\S)?:/(?=\s)', - r'(?msu)(?<=\s)\|:(?P\S[^:\|]{0,40}?\S)?:\|(?=\s)', + r'(?msu)(?<=\s)_(?P\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)/(?P\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)~~(?P\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)\*(?P\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)~(?P\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)_/(?P\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)_\*(?P\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)\*/(?P\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)_\*/(?P\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)/:(?P\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])', + r'(?msu)(?<=\s)\|:(?P\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])', ] for word in ITALICIZE_WORDS: From 156dc57e9996862a1a41a6ff5d48c286e7bb74e1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 23 Jan 2011 20:06:59 -0500 Subject: [PATCH 03/16] GUI add regex: Store history in gprefs instead of in filename_pattern. --- src/calibre/ebooks/metadata/meta.py | 8 +----- src/calibre/gui2/widgets.py | 41 ++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index b204e08bed..cbd9db3f04 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -142,13 +142,7 @@ def metadata_from_filename(name, pat=None): name = name.rpartition('.')[0] mi = MetaInformation(None, None) if pat is None: - pat_re = prefs.get('filename_pattern') - if isinstance(pat_re, list): - if pat_re: - pat_re = pat_re[0] - else: - pat_re = '' - pat = re.compile(pat_re) + pat = re.compile(prefs.get('filename_pattern')) name = name.replace('_', ' ') match = pat.search(name) if match is not None: diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index a433a6e5d7..6380eab0b2 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -71,17 +71,27 @@ class FilenamePattern(QWidget, Ui_Form): self.re.lineEdit().textChanged.connect(lambda x: self.changed_signal.emit()) def initialize(self, defaults=False): + # Get all itmes in the combobox. If we are resting + # to defaults we don't want to lose what the user + # has added. + val_hist = [unicode(self.re.lineEdit().text())] + [unicode(self.re.itemText(i)) for i in xrange(self.re.count())] + self.re.clear() + if defaults: val = prefs.defaults['filename_pattern'] else: val = prefs['filename_pattern'] - if isinstance(val, list): - if len(val) > 0: - for v in val: - self.re.addItem(v) - self.re.setCurrentIndex(0) - else: - self.re.lineEdit().setText(val if val else '') + self.re.lineEdit().setText(val) + + val_hist += gprefs.get('filename_pattern_history', ['(?P.+)', '(?P<author>[^_-]+) -?\s*(?P<series>[^_0-9-]*)(?P<series_index>[0-9]*)\s*-\s*(?P<title>[^_].+) ?']) + if val in val_hist: + del val_hist[val_hist.index(val)] + val_hist.insert(0, val) + for v in val_hist: + # Ensure we don't have duplicate items. + if v and self.re.findText(v) == -1: + self.re.addItem(v) + self.re.setCurrentIndex(0) def do_test(self): try: @@ -118,12 +128,17 @@ class FilenamePattern(QWidget, Ui_Form): return re.compile(pat) def commit(self): - pat = [] - patterns = [unicode(self.re.lineEdit().text())] + [unicode(self.re.itemText(i)) for i in xrange(self.re.count())] - for p in patterns[:14]: - if p not in pat: - pat.append(p) - prefs['filename_pattern'] = pat + pat = self.pattern().pattern + prefs['filename_pattern'] = pat + + history = [] + history_pats = [unicode(self.re.lineEdit().text())] + [unicode(self.re.itemText(i)) for i in xrange(self.re.count())] + for p in history_pats[:14]: + # Ensure we don't have duplicate items. + if p and p not in history: + history.append(p) + gprefs['filename_pattern_history'] = history + return pat From 66fdb25b288c864c06d6ac4fee07c228a21155c1 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 23 Jan 2011 21:28:44 -0500 Subject: [PATCH 04/16] GUI, Search and Replace: Cache the conversion document used in the wizard so we don't have to run the it multiple times as the user sets sr 1, 2 and 3. --- src/calibre/gui2/convert/regex_builder.py | 29 +++++++++++++++---- .../gui2/convert/search_and_replace.py | 13 +++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/convert/regex_builder.py b/src/calibre/gui2/convert/regex_builder.py index bdd219d733..d3cb82465a 100644 --- a/src/calibre/gui2/convert/regex_builder.py +++ b/src/calibre/gui2/convert/regex_builder.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' import re -from PyQt4.QtCore import SIGNAL, Qt +from PyQt4.QtCore import SIGNAL, Qt, pyqtSignal from PyQt4.QtGui import QDialog, QWidget, QDialogButtonBox, \ QBrush, QTextCursor, QTextEdit @@ -19,8 +19,8 @@ from calibre.gui2.dialogs.choose_format import ChooseFormatDialog class RegexBuilder(QDialog, Ui_RegexBuilder): - def __init__(self, db, book_id, regex, *args): - QDialog.__init__(self, *args) + def __init__(self, db, book_id, regex, doc=None, parent=None): + QDialog.__init__(self, parent) self.setupUi(self) self.regex.setText(regex) @@ -28,9 +28,13 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): if not db or not book_id: self.button_box.addButton(QDialogButtonBox.Open) - elif not self.select_format(db, book_id): + elif not doc and not self.select_format(db, book_id): self.cancelled = True return + + if doc: + self.preview.setPlainText(doc) + self.cancelled = False self.connect(self.button_box, SIGNAL('clicked(QAbstractButton*)'), self.button_clicked) self.connect(self.regex, SIGNAL('textChanged(QString)'), self.regex_valid) @@ -152,25 +156,37 @@ class RegexBuilder(QDialog, Ui_RegexBuilder): self.open_book(files[0]) if button == self.button_box.button(QDialogButtonBox.Ok): self.accept() + + def doc(self): + return unicode(self.preview.toPlainText()) class RegexEdit(QWidget, Ui_Edit): + doc_update = pyqtSignal(unicode) + def __init__(self, parent=None): QWidget.__init__(self, parent) self.setupUi(self) self.book_id = None self.db = None + self.doc_cache = None self.connect(self.button, SIGNAL('clicked()'), self.builder) def builder(self): - bld = RegexBuilder(self.db, self.book_id, self.edit.text(), self) + bld = RegexBuilder(self.db, self.book_id, self.edit.text(), self.doc_cache, self) if bld.cancelled: return + if not self.doc_cache: + self.doc_cache = bld.doc() + self.doc_update.emit(self.doc_cache) if bld.exec_() == bld.Accepted: self.edit.setText(bld.regex.text()) + def doc(self): + return self.doc_cache + def setObjectName(self, *args): QWidget.setObjectName(self, *args) if hasattr(self, 'edit'): @@ -184,6 +200,9 @@ class RegexEdit(QWidget, Ui_Edit): def set_db(self, db): self.db = db + + def set_doc(self, doc): + self.doc_cache = doc def break_cycles(self): self.db = None diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py index ba156c5b2a..9c10ef667f 100644 --- a/src/calibre/gui2/convert/search_and_replace.py +++ b/src/calibre/gui2/convert/search_and_replace.py @@ -34,14 +34,27 @@ class SearchAndReplaceWidget(Widget, Ui_Form): self.opt_sr3_search.set_msg(_('&Search Regular Expression')) self.opt_sr3_search.set_book_id(book_id) self.opt_sr3_search.set_db(db) + + self.opt_sr1_search.doc_update.connect(self.update_doc) + self.opt_sr2_search.doc_update.connect(self.update_doc) + self.opt_sr3_search.doc_update.connect(self.update_doc) def break_cycles(self): Widget.break_cycles(self) + + self.opt_sr1_search.doc_update.disconnect() + self.opt_sr2_search.doc_update.disconnect() + self.opt_sr3_search.doc_update.disconnect() self.opt_sr1_search.break_cycles() self.opt_sr2_search.break_cycles() self.opt_sr3_search.break_cycles() + def update_doc(self, doc): + self.opt_sr1_search.set_doc(doc) + self.opt_sr2_search.set_doc(doc) + self.opt_sr3_search.set_doc(doc) + def pre_commit_check(self): for x in ('sr1_search', 'sr2_search', 'sr3_search'): x = getattr(self, 'opt_'+x) From 6e64f5ec4e0cb66cc0bf722ecf050f72fb9120dc Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 24 Jan 2011 12:20:50 +0800 Subject: [PATCH 05/16] doc tweaks, delete empty paragraphs during Heuristics --- src/calibre/ebooks/conversion/utils.py | 2 ++ src/calibre/manual/conversion.rst | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index aabb1b8bc4..14eca46b07 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -367,6 +367,8 @@ class HeuristicProcessor(object): html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) # Delete microsoft 'smart' tags html = re.sub('(?i)</?st1:\w+>', '', html) + # Delete self closing paragraph tags + html = re.sub('<p\s?/>', '', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 6ec986f26a..7f3ff21fe0 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -587,11 +587,11 @@ TXT input supports a number of options to differentiate how paragraphs are detec Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when the next line that starts with an indent is reached:: - This is the + This is the first. - This is the second. + This is the second. - This is the + This is the third. :guilabel:`Paragraph Style: Unformatted` @@ -603,7 +603,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec formatting will be applied. :guilabel:`Formatting Style: Heuristic` - Analyses the document for common chapter headings, scene breaks, and italicized words and applies the + Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the appropriate html markup during conversion. :guilabel:`Formatting Style: Markdown` From 1e675af91aa665334261dafdc50f89724d2bf4f4 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 24 Jan 2011 12:22:42 +0800 Subject: [PATCH 06/16] Always apply title case to downloaded metadata - tired of crap metadata sources with 99% lowercase characters --- src/calibre/ebooks/metadata/fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 390f288d8e..8018f42b13 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -411,7 +411,7 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None, r.pubdate = pubdate def fix_case(x): - if x and x.isupper(): + if x: x = titlecase(x) return x From a20cd3336dc5a309543dd5134a4f85bf563b9b09 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 24 Jan 2011 13:47:53 +0800 Subject: [PATCH 07/16] improved negative lookaheads in heuristics --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index e8c2acb9d6..22639801ff 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -203,8 +203,8 @@ class HeuristicProcessor(object): blank_lines = "" opt_title_open = "(" opt_title_close = ")?" - n_lookahead_open = "\s+(?!" - n_lookahead_close = ")" + n_lookahead_open = "(?!\s*" + n_lookahead_close = ")\s*" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)" From d03ae9e001165d6edc82fbdbf884b89cd0b35ed9 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 24 Jan 2011 14:20:48 +0800 Subject: [PATCH 08/16] fixed a regression in chapter markup logic which was causing false negatives --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 22639801ff..1eb063cdd8 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -215,7 +215,7 @@ class HeuristicProcessor(object): [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines - [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters + [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon @@ -275,7 +275,7 @@ class HeuristicProcessor(object): self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ") if type_name == 'common': analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) - elif self.min_chapters <= hits < max_chapters: + elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits: analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) break else: From 670fc644edb235b4a1fed74ded7924a2b8c13ec4 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 24 Jan 2011 16:00:05 +0800 Subject: [PATCH 09/16] made chapter markup routine more Sigil friendly --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 1eb063cdd8..d9350e6adb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -43,7 +43,7 @@ class HeuristicProcessor(object): self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) - return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + return '<h2 title="'+chap+', '+title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n' def chapter_break(self, match): chap = match.group('section') From 7485c9a5e200c20285ead3795025781f4d9ef31e Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 24 Jan 2011 16:30:29 +0800 Subject: [PATCH 10/16] fixed sigil integration to strip html from chapter titles, fixed softbreak handling and enabled integration with extra_css --- src/calibre/ebooks/conversion/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index d9350e6adb..f6e259b6f9 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -25,13 +25,15 @@ class HeuristicProcessor(object): self.chapters_with_title = 0 self.blanks_deleted = False self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) - self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) + self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) + self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE) def is_pdftohtml(self, src): return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] def chapter_head(self, match): + from calibre.utils.html2text import html2text chap = match.group('chap') title = match.group('title') if not title: @@ -40,10 +42,12 @@ class HeuristicProcessor(object): " chapters. - " + unicode(chap)) return '<h2>'+chap+'</h2>\n' else: + txt_chap = html2text(chap) + txt_title = html2text(title) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) - return '<h2 title="'+chap+', '+title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n' + return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n' def chapter_break(self, match): chap = match.group('section') @@ -469,7 +473,7 @@ class HeuristicProcessor(object): if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html) + html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -524,11 +528,11 @@ class HeuristicProcessor(object): # Center separator lines html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html) if not self.blanks_deleted: - html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html) - html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html) + html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html) + html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html) - + html = self.softbreak.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html) return html From 45ee4769896363190af6a2ab00dcdc9597de140c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 24 Jan 2011 09:22:43 -0700 Subject: [PATCH 11/16] Updated recipes for Heise, HNA and ZDNET --- resources/recipes/heise.recipe | 2 ++ resources/recipes/hna.recipe | 3 ++- resources/recipes/zdnet.recipe | 25 ++++++++++++++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/resources/recipes/heise.recipe b/resources/recipes/heise.recipe index 9edf3774fc..56d5516656 100644 --- a/resources/recipes/heise.recipe +++ b/resources/recipes/heise.recipe @@ -52,6 +52,7 @@ class heiseDe(BasicNewsRecipe): dict(id='navi_login'), dict(id='navigation'), dict(id='breadcrumb'), + dict(id='adblockerwarnung'), dict(id=''), dict(id='sitemap'), dict(id='bannerzone'), @@ -67,3 +68,4 @@ class heiseDe(BasicNewsRecipe): + diff --git a/resources/recipes/hna.recipe b/resources/recipes/hna.recipe index 6e843800ee..e3349f0c7b 100644 --- a/resources/recipes/hna.recipe +++ b/resources/recipes/hna.recipe @@ -21,7 +21,7 @@ class hnaDe(BasicNewsRecipe): max_articles_per_feed = 40 no_stylesheets = True remove_javascript = True - encoding = 'iso-8859-1' + encoding = 'utf-8' remove_tags = [dict(id='topnav'), dict(id='nav_main'), @@ -60,3 +60,4 @@ class hnaDe(BasicNewsRecipe): feeds = [ ('hna_soehre', 'http://feeds2.feedburner.com/hna/soehre'), ('hna_kassel', 'http://feeds2.feedburner.com/hna/kassel') ] + diff --git a/resources/recipes/zdnet.recipe b/resources/recipes/zdnet.recipe index 9673eb1fcf..1a0f1562b5 100644 --- a/resources/recipes/zdnet.recipe +++ b/resources/recipes/zdnet.recipe @@ -27,12 +27,34 @@ class cdnet(BasicNewsRecipe): dict(id='header'), dict(id='search'), dict(id='nav'), + dict(id='blog-author-info'), + dict(id='post-tags'), + dict(id='bio-naraine'), + dict(id='bio-kennedy'), + dict(id='author-short-disclosure-kennedy'), dict(id=''), dict(name='div', attrs={'class':'banner'}), + dict(name='div', attrs={'class':'int'}), + dict(name='div', attrs={'class':'talkback clear space-2'}), + dict(name='div', attrs={'class':'content-1 clear'}), + dict(name='div', attrs={'class':'space-2'}), + dict(name='div', attrs={'class':'space-3'}), + dict(name='div', attrs={'class':'thumb-2 left'}), + dict(name='div', attrs={'class':'hotspot'}), + dict(name='div', attrs={'class':'hed hed-1 space-1'}), + dict(name='div', attrs={'class':'view-1 clear content-3 space-2'}), + dict(name='div', attrs={'class':'hed hed-1 space-1'}), + dict(name='div', attrs={'class':'hed hed-1'}), + dict(name='div', attrs={'class':'post-header'}), + dict(name='div', attrs={'class':'lvl-nav clear'}), + dict(name='div', attrs={'class':'t-share-overlay overlay-pop contain-overlay-4'}), dict(name='p', attrs={'class':'tags'}), + dict(name='span', attrs={'class':'follow'}), + dict(name='span', attrs={'class':'int'}), + dict(name='h4', attrs={'class':'h s-4'}), dict(name='a', attrs={'href':'http://www.twitter.com/ryanaraine'}), dict(name='div', attrs={'class':'special1'})] - remove_tags_after = [dict(name='div', attrs={'class':'bloggerDesc clear'})] + remove_tags_after = [dict(name='div', attrs={'class':'clear'})] feeds = [ ('zdnet', 'http://feeds.feedburner.com/zdnet/security') ] @@ -43,3 +65,4 @@ class cdnet(BasicNewsRecipe): return soup + From ffb62ece0e975a526ad60b14a5e7ab46f413b8d3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 24 Jan 2011 09:32:23 -0700 Subject: [PATCH 12/16] Fix #8031 (Output Options) --- src/calibre/gui2/widgets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index 9e117822e4..8077fe46f5 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -16,7 +16,6 @@ from PyQt4.Qt import QIcon, QFont, QLabel, QListWidget, QAction, \ QTimer, QRect from calibre.gui2 import NONE, error_dialog, pixmap_to_data, gprefs -from calibre.constants import isosx from calibre.gui2.filename_pattern_ui import Ui_Form from calibre import fit_image from calibre.ebooks import BOOK_EXTENSIONS @@ -304,8 +303,9 @@ class FontFamilyModel(QAbstractListModel): return NONE if role == Qt.DisplayRole: return QVariant(family) - if not isosx and role == Qt.FontRole: - # Causes a Qt crash with some fonts on OS X + if False and role == Qt.FontRole: + # Causes a Qt crash with some fonts + # so disabled. return QVariant(QFont(family)) return NONE From 5474aa42682d79c9967544f93e38103d70c0b841 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 24 Jan 2011 09:47:20 -0700 Subject: [PATCH 13/16] tportal.h by DM. Fixes #8552 (New recipe for daily tportal) --- resources/images/news/dailytportal.png | Bin 0 -> 635 bytes resources/recipes/dailytportal.recipe | 66 +++++++++++++++++++++++++ src/calibre/utils/localization.py | 1 + 3 files changed, 67 insertions(+) create mode 100644 resources/images/news/dailytportal.png create mode 100644 resources/recipes/dailytportal.recipe diff --git a/resources/images/news/dailytportal.png b/resources/images/news/dailytportal.png new file mode 100644 index 0000000000000000000000000000000000000000..38b06e675a24eccbca5297987fe696e531bf9dfd GIT binary patch literal 635 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87#Np$x;TbdoQ|EmuusfUpk@E7nf4V;Oe!{v z86sU1J}{_g2sIsCXvk77*68$+!G^`sLhpxlrTT++tQ@;<x6YXP*SvoA+`I4At(*1g z>e=7?YkgEE3T@Loz|_Lqpu8ee&5^NbX6LPQi#1fIoIJNiA%1_&qhEj2S!X!+%vf=u zJ><3v%jRCyG%Z)|ly$tt>^_|=Uh-T5eG0}ag;;kSj6alDWaSpw{r%H7hPemr#dALY zej}cLf6tOE|DUhkw7mYD+_dxJw1_qq&tAq(<saW3Pw75w@0)DW@c8|53zwjcu2qbk za~EDSQ3y(SaPxDx%XHH%cC{aODSEy!Vr-0#pS$?39b4(!%x0I0^|G_9i&?f!Y*O94 zTVdh4?CQglPgU8S(KieAHZI9r^1<WVvQ>Mhw(q?3UD2yP@O$kAz1hnyT4c3zPwEJq zAY`3Z^!^^#wyB0UdKeRDv)+&uVfLx5oo`cj<=W2ceK#bXgdTod>~m#mmV}<(dWIi= zK6b3$EPQHNC1X&4_<}E{_ZBUGaQ8QVh}ye*JNMS<&t9He^7<aLdHm5to&yJqHr(d> z#eYQS#k?EWwH^W^PqoA~q9i4;B-JXpC>2OC7#SEE>KYj88kvL`SX!BwS{WJW8kk!d s7$g;NK0wiso1c=IR*9^^(7?*r2#6pWW~RFK12r&sy85}Sb4q9e0DvF?i~s-t literal 0 HcmV?d00001 diff --git a/resources/recipes/dailytportal.recipe b/resources/recipes/dailytportal.recipe new file mode 100644 index 0000000000..6e2646bfca --- /dev/null +++ b/resources/recipes/dailytportal.recipe @@ -0,0 +1,66 @@ +__license__ = 'GPL v3' +__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>' +''' +daily.tportal.hr +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Pagina12(BasicNewsRecipe): + title = 'Daily tportal.h' + __author__ = 'Darko Miletic' + description = 'News from Croatia' + publisher = 'tportal.hr' + category = 'news, politics, Croatia' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'en_HR' + remove_empty_feeds = True + publication_type = 'newsportal' + extra_css = """ + body{font-family: Verdana,sans-serif } + img{margin-bottom: 0.4em; display:block} + h1,h2{color: #2D648A; font-family: Georgia,serif} + .artAbstract{font-size: 1.2em; font-family: Georgia,serif} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [ + dict(name=['meta','link','embed','object','iframe','base']) + ,dict(name='div', attrs={'class':'artInfo'}) + ] + remove_attributes=['lang'] + + keep_only_tags=dict(attrs={'class':'articleDetails'}) + + feeds = [(u'News', u'http://daily.tportal.hr/rss/dailynaslovnicarss.xml')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup + diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index d452721113..b9995db2bf 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -105,6 +105,7 @@ _extra_lang_codes = { 'en_TH' : _('English (Thailand)'), 'en_CY' : _('English (Cyprus)'), 'en_PK' : _('English (Pakistan)'), + 'en_HR' : _('English (Croatia)'), 'en_IL' : _('English (Israel)'), 'en_SG' : _('English (Singapore)'), 'en_YE' : _('English (Yemen)'), From e1ebc4946f2f0fa744682440e7b363202036cf56 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 24 Jan 2011 12:46:12 -0700 Subject: [PATCH 14/16] Fix #8558 (Tag editor issues with version 0.7.42) --- src/calibre/gui2/dialogs/tag_editor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/dialogs/tag_editor.py b/src/calibre/gui2/dialogs/tag_editor.py index d4325354a1..6bd8eb7dbe 100644 --- a/src/calibre/gui2/dialogs/tag_editor.py +++ b/src/calibre/gui2/dialogs/tag_editor.py @@ -16,7 +16,7 @@ class TagEditor(QDialog, Ui_TagEditor): self.setupUi(self) self.db = db - self.index = db.row(id_) + self.index = db.row(id_) if id_ is not None else None if self.index is not None: tags = self.db.tags(self.index) else: From 135edd0a7c2a20847cc2b7f9a49c6803460984d0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 24 Jan 2011 12:54:50 -0700 Subject: [PATCH 15/16] Bulk metadata edit dialog: Remember last used size. Fixes #8525 (Edit Multiple Books Window Size) --- src/calibre/gui2/dialogs/metadata_bulk.py | 14 +++++++++++++- src/calibre/gui2/dialogs/metadata_bulk.ui | 12 ++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index 36a035cd94..cf4252e9ed 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -15,7 +15,7 @@ from calibre.ebooks.metadata import string_to_authors, authors_to_string from calibre.ebooks.metadata.book.base import composite_formatter from calibre.ebooks.metadata.meta import get_metadata from calibre.gui2.custom_column_widgets import populate_metadata_page -from calibre.gui2 import error_dialog, ResizableDialog, UNDEFINED_QDATE +from calibre.gui2 import error_dialog, ResizableDialog, UNDEFINED_QDATE, gprefs from calibre.gui2.progress_indicator import ProgressIndicator from calibre.utils.config import dynamic, JSONConfig from calibre.utils.titlecase import titlecase @@ -321,8 +321,15 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog): 'This operation cannot be canceled or undone')) self.do_again = False self.central_widget.setCurrentIndex(tab) + geom = gprefs.get('bulk_metadata_window_geometry', None) + if geom is not None: + self.restoreGeometry(bytes(geom)) self.exec_() + def save_state(self, *args): + gprefs['bulk_metadata_window_geometry'] = \ + bytearray(self.saveGeometry()) + def do_apply_pubdate(self, *args): self.apply_pubdate.setChecked(True) @@ -790,7 +797,12 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog): self.series_start_number.setEnabled(False) self.series_start_number.setValue(1) + def reject(self): + self.save_state() + ResizableDialog.reject(self) + def accept(self): + self.save_state() if len(self.ids) < 1: return QDialog.accept(self) diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui index 481a485bc2..163d49b328 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.ui +++ b/src/calibre/gui2/dialogs/metadata_bulk.ui @@ -6,8 +6,8 @@ <rect> <x>0</x> <y>0</y> - <width>850</width> - <height>700</height> + <width>962</width> + <height>727</height> </rect> </property> <property name="windowTitle"> @@ -44,8 +44,8 @@ <rect> <x>0</x> <y>0</y> - <width>842</width> - <height>639</height> + <width>954</width> + <height>666</height> </rect> </property> <layout class="QVBoxLayout" name="verticalLayout_2"> @@ -996,8 +996,8 @@ not multiple and the destination field is multiple</string> <rect> <x>0</x> <y>0</y> - <width>826</width> - <height>323</height> + <width>197</width> + <height>60</height> </rect> </property> <layout class="QGridLayout" name="testgrid"> From e526bcc3361ba6bdf634220a2223a96ff7cec969 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 24 Jan 2011 13:11:38 -0700 Subject: [PATCH 16/16] Fix #8563 (Calibre not recognizing archos 70 250GB) --- src/calibre/devices/android/driver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index a95e3c46fa..16022fc752 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -54,7 +54,7 @@ class ANDROID(USBMS): 0x1004 : { 0x61cc : [0x100] }, # Archos - 0x0e79 : { 0x1419: [0x0216], 0x1420 : [0x0216]}, + 0x0e79 : { 0x1419: [0x0216], 0x1420 : [0x0216], 0x1422 : [0x0216]}, } EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books'] @@ -70,7 +70,7 @@ class ANDROID(USBMS): '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897', 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE', - 'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT'] + 'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT', 'A70H'] WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', 'A70S', 'A101IT']