From 093404e2081f9b430c19dc1cc994b35126849ab2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 14 Dec 2009 16:48:36 -0700 Subject: [PATCH 1/7] Improved recipe for Newsweek --- resources/recipes/newsweek.recipe | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/resources/recipes/newsweek.recipe b/resources/recipes/newsweek.recipe index ff408ca9a5..f6da941361 100644 --- a/resources/recipes/newsweek.recipe +++ b/resources/recipes/newsweek.recipe @@ -33,19 +33,21 @@ class Newsweek(BasicNewsRecipe): language = 'en' remove_tags = [ - {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content', + {'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content', 'inline-social-links-wrapper', 'email-article','ToolBox', + 'inline-promo-link', 'sponsorship', 'inlineComponentRight', 'comments-and-social-links-wrapper', 'EmailArticleBlock']}, {'id' : ['footer', 'ticker-data', 'topTenVertical', - 'digg-top-five', 'mesothorax', 'nw-comments', + 'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing', 'ToolBox', 'EmailMain']}, {'class': re.compile('related-cloud')}, dict(name='li', attrs={'id':['slug_bigbox']}) ] - keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent','photoBox']}, ] + keep_only_tags = [{'class':['article HorizontalHeader', + 'articlecontent','photoBox', 'article columnist first']}, ] recursions = 1 match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+'] From b632639ff790a77d741821952b77a54cefae633c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Dec 2009 08:47:37 -0700 Subject: [PATCH 2/7] ... 
--- src/calibre/ebooks/pdf/reflow.py | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 8cef0f327d..7afbb62b45 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -23,6 +23,7 @@ class Text(object): self.font_map = font_map self.top, self.left, self.width, self.height = map(float, map(text.get, ('top', 'left', 'width', 'height'))) + self.bottom = self.top + self.height self.font = self.font_map[text.get('font')] self.font_size = self.font.size self.color = self.font.color @@ -31,6 +32,18 @@ class Text(object): self.text_as_string = etree.tostring(text, method='text', encoding=unicode) +class FontSizeStats(dict): + + def __init__(self, stats): + total = float(sum(stats.values())) + self.most_common_size, self.chars_at_most_common_size = -1, 0 + + for sz, chars in stats.items(): + if chars >= self.chars_at_most_common_size: + self.most_common_size, self.chars_at_most_common_size = sz, chars + self[sz] = chars/total + + class Page(object): def __init__(self, page, font_map, opts, log): @@ -46,6 +59,15 @@ class Page(object): for text in page.xpath('descendant::text'): self.texts.append(Text(text, self.font_map, self.opts, self.log)) + self.font_size_stats = {} + for t in self.texts: + if t.font_size not in self.font_size_stats: + self.font_size_stats[t.font_size] = 0 + self.font_size_stats[t.font_size] += len(t.text_as_string) + + self.font_size_stats = FontSizeStats(self.font_size_stats) + + class PDFDocument(object): @@ -69,6 +91,17 @@ class PDFDocument(object): self.page_map[page.id] = page self.pages.append(page) + self.collect_font_statistics() + + def collect_font_statistics(self): + self.font_size_stats = {} + for p in self.pages: + for sz, chars in p.font_size_stats: + if sz not in self.font_size_stats: + self.font_size_stats[sz] = 0 + self.font_size_stats[sz] += chars + + self.font_size_stats = 
FontSizeStats(self.font_size_stats) From 3345f311dd2bdbc74ff9ff330bb73931f2133d99 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Dec 2009 08:50:57 -0700 Subject: [PATCH 3/7] Fix regression that broke device detection for Cybook devices in 0.6.28 on windows and linux --- src/calibre/devices/cybookg3/driver.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 1cf51f78ec..04e5e7012c 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -82,9 +82,8 @@ class CYBOOKG3(USBMS): def can_handle(cls, device_info, debug=False): USBMS.can_handle(device_info, debug) if islinux: - if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3': - return True - return False + return device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3' + return True class CYBOOK_OPUS(CYBOOKG3): @@ -92,7 +91,7 @@ class CYBOOK_OPUS(CYBOOKG3): name = 'Cybook Opus Device Interface' gui_name = 'Cybook Opus' description = _('Communicate with the Cybook Opus eBook reader.') - author = _('John Schember') + author = 'John Schember' supported_platforms = ['windows', 'osx', 'linux'] FORMATS = ['epub', 'pdf', 'txt'] @@ -116,6 +115,5 @@ class CYBOOK_OPUS(CYBOOKG3): def can_handle(cls, device_info, debug=False): USBMS.can_handle(device_info, debug) if islinux: - if device_info[3] == 'Bookeen': - return True - return False + return device_info[3] == 'Bookeen' + return True From 2042052a92742482465398ab2e99368badef7bb2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Dec 2009 09:10:14 -0700 Subject: [PATCH 4/7] Improved recipe for the Financial Times --- resources/recipes/financial_times.recipe | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/resources/recipes/financial_times.recipe b/resources/recipes/financial_times.recipe index 9c42c1e8f7..2864f7be89 100644 --- 
a/resources/recipes/financial_times.recipe +++ b/resources/recipes/financial_times.recipe @@ -21,6 +21,7 @@ class FinancialTimes(BasicNewsRecipe): needs_subscription = True simultaneous_downloads= 1 delay = 1 + LOGIN = 'https://registration.ft.com/registration/barrier/login' def get_browser(self): @@ -38,7 +39,16 @@ class FinancialTimes(BasicNewsRecipe): remove_tags = [ dict(name='div', attrs={'id':'floating-con'}) ] - + + extra_css = ''' + body{font-family:Arial,Helvetica,sans-serif;} + h2(font-size:large;} + .ft-story-header(font-size:xx-small;} + .ft-story-body(font-size:small;} + a{color:#003399;} + .container{font-size:x-small;} + h3{font-size:x-small;color:#003399;} + ''' feeds = [ (u'UK' , u'http://www.ft.com/rss/home/uk' ) ,(u'US' , u'http://www.ft.com/rss/home/us' ) @@ -50,4 +60,4 @@ class FinancialTimes(BasicNewsRecipe): content_type = soup.find('meta', {'http-equiv':'Content-Type'}) if content_type: content_type['content'] = 'text/html; charset=utf-8' - return soup \ No newline at end of file + return soup From 6e25583bc0b29c87eea491f145fd7acec5a58c52 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Dec 2009 09:22:24 -0700 Subject: [PATCH 5/7] Fix #4220 (E-Book Viewer position in book navigation is broken) --- resources/recipes/financial_times.recipe | 20 ++++++++++---------- src/calibre/gui2/viewer/main.py | 5 ++++- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/resources/recipes/financial_times.recipe b/resources/recipes/financial_times.recipe index 2864f7be89..25efc56e45 100644 --- a/resources/recipes/financial_times.recipe +++ b/resources/recipes/financial_times.recipe @@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class FinancialTimes(BasicNewsRecipe): title = u'Financial Times' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic and Sujata Raman' description = 'Financial world news' oldest_article = 2 language = 'en' @@ -21,9 +21,9 @@ class FinancialTimes(BasicNewsRecipe): needs_subscription = 
True simultaneous_downloads= 1 delay = 1 - + LOGIN = 'https://registration.ft.com/registration/barrier/login' - + def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: @@ -33,7 +33,7 @@ class FinancialTimes(BasicNewsRecipe): br['password'] = self.password br.submit() return br - + keep_only_tags = [ dict(name='div', attrs={'id':'cont'}) ] remove_tags_after = dict(name='p', attrs={'class':'copyright'}) remove_tags = [ @@ -48,12 +48,12 @@ class FinancialTimes(BasicNewsRecipe): a{color:#003399;} .container{font-size:x-small;} h3{font-size:x-small;color:#003399;} - ''' - feeds = [ - (u'UK' , u'http://www.ft.com/rss/home/uk' ) - ,(u'US' , u'http://www.ft.com/rss/home/us' ) - ,(u'Asia' , u'http://www.ft.com/rss/home/asia' ) - ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast') + ''' + feeds = [ + (u'UK' , u'http://www.ft.com/rss/home/uk' ) + ,(u'US' , u'http://www.ft.com/rss/home/us' ) + ,(u'Asia' , u'http://www.ft.com/rss/home/asia' ) + ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast') ] def preprocess_html(self, soup): diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index cb2f3da7d6..7030d2623d 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -228,7 +228,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer): self.connect(self.action_bookmark, SIGNAL('triggered(bool)'), self.bookmark) self.connect(self.action_forward, SIGNAL('triggered(bool)'), self.forward) self.connect(self.action_preferences, SIGNAL('triggered(bool)'), lambda x: self.view.config(self)) - self.connect(self.pos, SIGNAL('valueChanged(double)'), self.goto_page) + self.pos.editingFinished.connect(self.goto_page_num) self.connect(self.vertical_scrollbar, SIGNAL('valueChanged(int)'), lambda x: self.goto_page(x/100.)) self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), self.find) @@ -319,6 +319,9 @@ class EbookViewer(MainWindow, 
Ui_EbookViewer): if pos is not None: self.goto_page(pos) + def goto_page_num(self): + num = self.pos.value() + self.goto_page(num) def forward(self, x): pos = self.history.forward() From ac9a4e11e54e8ef8f0d8f45c9401ca320a26fd5f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Dec 2009 16:15:25 -0700 Subject: [PATCH 6/7] ... --- src/calibre/ebooks/pdf/reflow.py | 88 ++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 7afbb62b45..53be9a23de 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -6,6 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import sys + from lxml import etree class Font(object): @@ -24,6 +26,7 @@ class Text(object): self.top, self.left, self.width, self.height = map(float, map(text.get, ('top', 'left', 'width', 'height'))) self.bottom = self.top + self.height + self.right = self.left + self.width self.font = self.font_map[text.get('font')] self.font_size = self.font.size self.color = self.font.color @@ -43,6 +46,46 @@ class FontSizeStats(dict): self.most_common_size, self.chars_at_most_common_size = sz, chars self[sz] = chars/total +class Interval(object): + + def __init__(self, left, right): + self.left, self.right = left, right + self.width = right - left + + def intersection(self, other): + left = max(self.left, other.left) + right = min(self.right, other.right) + return Interval(left, right) + + def __nonzero__(self): + return self.width > 0 + + def __eq__(self, other): + return self.left == other.left and self.right == other.right + + def __hash__(self): + return hash('(%f,%f)'%self.left, self.right) + + +class HorizontalBox(object): + + def __init__(self, base_text): + self.texts = [base_text] + self.bottom = base_text.bottom + self.number_of_columns = None + self.column_map = {} + + def append(self, t): + self.texts.append(t) + + def 
sort(self): + self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left)) + self.top, self.bottom = sys.maxint, 0 + for t in self.texts: + self.top = min(self.top, t.top) + self.bottom = max(self.bottom, t.bottom) + self.left = self.texts[0].left + self.right = self.texts[-1].right class Page(object): @@ -55,9 +98,14 @@ class Page(object): self.id = 'page%d'%self.number self.texts = [] + self.left_margin, self.right_margin = self.width, 0 for text in page.xpath('descendant::text'): self.texts.append(Text(text, self.font_map, self.opts, self.log)) + self.left_margin = min(text.left, self.left_margin) + self.right_margin = max(text.right, self.right_margin) + + self.textwidth = self.right_margin - self.left_margin self.font_size_stats = {} for t in self.texts: @@ -67,6 +115,43 @@ class Page(object): self.font_size_stats = FontSizeStats(self.font_size_stats) + self.identify_columns() + + def sort_into_horizontal_boxes(self, document_font_size_stats): + self.horizontal_boxes = [] + + def find_closest_match(text): + 'Return horizontal box whose bottom is closest to text or None' + min, ans = 3.1, None + for hb in self.horizontal_boxes: + diff = abs(text.bottom - hb.bottom) + if diff < min: + diff, ans = min, hb + return ans + + for t in self.texts: + hb = find_closest_match(t) + if hb is None: + self.horizontal_boxes.append(HorizontalBox(t)) + else: + hb.append(t) + + + for hb in self.horizontal_boxes: + hb.sort() + + self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom)) + + def identify_columns(self): + + def neighborhood(i): + if i == 0: + return self.horizontal_boxes[1:3] + return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1]) + + for i, hbox in enumerate(self.horizontal_boxes): + pass + class PDFDocument(object): @@ -93,6 +178,9 @@ class PDFDocument(object): self.collect_font_statistics() + for page in self.pages: + page.sort_into_horizontal_boxes(self.font_size_stats) + def collect_font_statistics(self): self.font_size_stats = {} for p in 
self.pages: From d928b3479881b876d549dbf6f54545f2bc18986a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Dec 2009 16:16:43 -0700 Subject: [PATCH 7/7] PML/PDB Output: Use \CX tags to generate chapter index --- src/calibre/ebooks/pdb/ereader/writer.py | 52 +++++++++--------------- src/calibre/ebooks/pml/pmlml.py | 29 ++++++++++--- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 263f6964bf..a379899af5 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -42,8 +42,8 @@ class Writer(FormatWriter): pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') text, text_sizes = self._text(pml) - chapter_index = self._chapter_index(pml) - link_index = self._link_index(pml) + chapter_index = self._index_item(r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', pml) + link_index = self._index_item(r'(?s)\\Q="(?P<text>.+?)"', pml) images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs) metadata = [self._metadata(metadata)] hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))] @@ -101,38 +101,24 @@ class Writer(FormatWriter): return pml_pages, text_sizes - def _index_item(self, mo): - index = '' - if 'text' in mo.groupdict().keys(): - index += struct.pack('>L', mo.start()) - text = mo.group('text') - # Strip all PML tags from text - text = re.sub(r'\\U[0-9a-z]{4}', '', text) - text = re.sub(r'\\a\d{3}', '', text) - text = re.sub(r'\\.', '', text) - # Add appropriate spacing to denote the various levels of headings - if 'val' in mo.groupdict().keys(): - text = '%s%s' % (' ' * 4 * int(mo.group('val')), text) - index += text - index += '\x00' - return index - - def _chapter_index(self, pml): - chapter_marks = [ - r'(?s)\\x(?P<text>.+?)\\x', - r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', - r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', - ] + def _index_item(self, regex, pml): index = [] - for chapter_mark in 
chapter_marks: - for mo in re.finditer(chapter_mark, pml): - index.append(self._index_item(mo)) - return index - - def _link_index(self, pml): - index = [] - for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml): - index.append(self._index_item(mo)) + for mo in re.finditer(regex, pml): + item = '' + if 'text' in mo.groupdict().keys(): + item += struct.pack('>L', mo.start()) + text = mo.group('text') + # Strip all PML tags from text + text = re.sub(r'\\U[0-9a-z]{4}', '', text) + text = re.sub(r'\\a\d{3}', '', text) + text = re.sub(r'\\.', '', text) + # Add appropriate spacing to denote the various levels of headings + if 'val' in mo.groupdict().keys(): + text = '%s%s' % (' ' * 4 * int(mo.group('val')), text) + item += text + item += '\x00' + if item: + index.append(item) return index def _images(self, manifest, image_hrefs): diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index b23cd40813..ccce95fce6 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -79,6 +79,16 @@ class PMLMLizer(object): self.log.info('Converting XHTML to PML markup...') self.oeb_book = oeb_book self.opts = opts + + # This is used for adding \CX chapter-marker tags. This is separate + # from the optional inline toc. 
+ self.toc = {} + for item in oeb_book.toc: + page, mid, id = item.href.partition('#') + if not self.toc.get(page, None): + self.toc[page] = {} + self.toc[page][id] = item.title + return self.pmlmlize_spine() def pmlmlize_spine(self): @@ -107,7 +117,11 @@ class PMLMLizer(object): return output def get_toc(self): - toc = [u''] + ''' + Generation of inline TOC + ''' + + toc = [] if self.opts.inline_toc: self.log.debug('Generating table of contents...') toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:')) @@ -177,14 +191,14 @@ class PMLMLizer(object): def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: - return [u''] + return [] - text = [u''] + text = [] style = stylizer.style(elem) if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': - return [u''] + return [] tag = barename(elem.tag) tag_count = 0 @@ -213,6 +227,12 @@ class PMLMLizer(object): else: w += '="50%"' text.append(w) + toc_id = elem.attrib.get('id', None) + if toc_id: + if self.toc.get(page.href, None): + toc_title = self.toc[page.href].get(toc_id, None) + if toc_title: + text.append('\\C1="%s"' % toc_title) # Process style information that needs holds a single tag # Commented out because every page in an OEB book starts with this style @@ -287,4 +307,3 @@ class PMLMLizer(object): if tag != 'block': text.append('\\%s' % tag) return text -