From 093404e2081f9b430c19dc1cc994b35126849ab2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 14 Dec 2009 16:48:36 -0700
Subject: [PATCH 1/7] Improved recipe for Newsweek

---
 resources/recipes/newsweek.recipe | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/resources/recipes/newsweek.recipe b/resources/recipes/newsweek.recipe
index ff408ca9a5..f6da941361 100644
--- a/resources/recipes/newsweek.recipe
+++ b/resources/recipes/newsweek.recipe
@@ -33,19 +33,21 @@ class Newsweek(BasicNewsRecipe):
     language = 'en'
 
     remove_tags = [
-            {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
+            {'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
                 'inline-social-links-wrapper', 'email-article','ToolBox',
+                'inline-promo-link', 'sponsorship',
                 'inlineComponentRight',
                 'comments-and-social-links-wrapper', 'EmailArticleBlock']},
             {'id' : ['footer', 'ticker-data', 'topTenVertical',
-                'digg-top-five', 'mesothorax', 'nw-comments',
+                'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
                 'ToolBox', 'EmailMain']},
             {'class': re.compile('related-cloud')},
             dict(name='li', attrs={'id':['slug_bigbox']})
             ]
 
 
-    keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent','photoBox']}, ]
+    keep_only_tags = [{'class':['article HorizontalHeader',
+        'articlecontent','photoBox', 'article columnist first']}, ]
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 

From b632639ff790a77d741821952b77a54cefae633c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Dec 2009 08:47:37 -0700
Subject: [PATCH 2/7] ...

---
 src/calibre/ebooks/pdf/reflow.py | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 8cef0f327d..7afbb62b45 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -23,6 +23,7 @@ class Text(object):
         self.font_map = font_map
         self.top, self.left, self.width, self.height = map(float, map(text.get,
             ('top', 'left', 'width', 'height')))
+        self.bottom  = self.top + self.height
         self.font = self.font_map[text.get('font')]
         self.font_size = self.font.size
         self.color = self.font.color
@@ -31,6 +32,18 @@ class Text(object):
         self.text_as_string = etree.tostring(text, method='text',
                 encoding=unicode)
 
+class FontSizeStats(dict):  # maps font size -> its share of the summed values of stats
+
+    def __init__(self, stats):
+        total = float(sum(stats.values())) or 1.0  # float for true division; 1.0 avoids ZeroDivisionError when every count is 0
+        self.most_common_size, self.chars_at_most_common_size = -1, 0  # kept as-is if stats is empty
+
+        for sz, chars in stats.items():
+            if chars >= self.chars_at_most_common_size:  # >=: on a tie the size iterated last wins
+                self.most_common_size, self.chars_at_most_common_size = sz, chars
+            self[sz] = chars/total
+
+
 class Page(object):
 
     def __init__(self, page, font_map, opts, log):
@@ -46,6 +59,15 @@ class Page(object):
         for text in page.xpath('descendant::text'):
             self.texts.append(Text(text, self.font_map, self.opts, self.log))
 
+        self.font_size_stats = {}
+        for t in self.texts:
+            if t.font_size not in self.font_size_stats:
+                self.font_size_stats[t.font_size] = 0
+            self.font_size_stats[t.font_size] += len(t.text_as_string)
+
+        self.font_size_stats = FontSizeStats(self.font_size_stats)
+
+
 
 class PDFDocument(object):
 
@@ -69,6 +91,17 @@ class PDFDocument(object):
             self.page_map[page.id] = page
             self.pages.append(page)
 
+        self.collect_font_statistics()
+
+    def collect_font_statistics(self):
+        self.font_size_stats = {}
+        for p in self.pages:
+            for sz, chars in p.font_size_stats.items():  # a bare dict iterates keys only, which cannot unpack
+                if sz not in self.font_size_stats:
+                    self.font_size_stats[sz] = 0
+                self.font_size_stats[sz] += chars
+        # NOTE(review): page stats are already per-page fractions, so this sums shares, not character counts
+        self.font_size_stats = FontSizeStats(self.font_size_stats)
 
 
 

From 3345f311dd2bdbc74ff9ff330bb73931f2133d99 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Dec 2009 08:50:57 -0700
Subject: [PATCH 3/7] Fix regression that broke device detection for Cybook
 devices in 0.6.28 on windows and linux

---
 src/calibre/devices/cybookg3/driver.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py
index 1cf51f78ec..04e5e7012c 100644
--- a/src/calibre/devices/cybookg3/driver.py
+++ b/src/calibre/devices/cybookg3/driver.py
@@ -82,9 +82,8 @@ class CYBOOKG3(USBMS):
     def can_handle(cls, device_info, debug=False):
         USBMS.can_handle(device_info, debug)
         if islinux:
-            if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3':
-                return True
-        return False
+            return device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3'
+        return True
 
 
 class CYBOOK_OPUS(CYBOOKG3):
@@ -92,7 +91,7 @@ class CYBOOK_OPUS(CYBOOKG3):
     name           = 'Cybook Opus Device Interface'
     gui_name       = 'Cybook Opus'
     description    = _('Communicate with the Cybook Opus eBook reader.')
-    author         = _('John Schember')
+    author         = 'John Schember'
     supported_platforms = ['windows', 'osx', 'linux']
 
     FORMATS = ['epub', 'pdf', 'txt']
@@ -116,6 +115,5 @@ class CYBOOK_OPUS(CYBOOKG3):
     def can_handle(cls, device_info, debug=False):
         USBMS.can_handle(device_info, debug)
         if islinux:
-            if device_info[3] == 'Bookeen':
-                return True
-        return False
+            return device_info[3] == 'Bookeen'
+        return True

From 2042052a92742482465398ab2e99368badef7bb2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Dec 2009 09:10:14 -0700
Subject: [PATCH 4/7] Improved recipe for the Financial Times

---
 resources/recipes/financial_times.recipe | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/resources/recipes/financial_times.recipe b/resources/recipes/financial_times.recipe
index 9c42c1e8f7..2864f7be89 100644
--- a/resources/recipes/financial_times.recipe
+++ b/resources/recipes/financial_times.recipe
@@ -21,6 +21,7 @@ class FinancialTimes(BasicNewsRecipe):
     needs_subscription    = True
     simultaneous_downloads= 1
     delay                 = 1
+    
     LOGIN = 'https://registration.ft.com/registration/barrier/login'
     
     def get_browser(self):
@@ -38,7 +39,16 @@ class FinancialTimes(BasicNewsRecipe):
     remove_tags = [
                      dict(name='div', attrs={'id':'floating-con'})
                   ]
-    
+
+    extra_css = '''
+                body{font-family:Arial,Helvetica,sans-serif;}
+                h2{font-size:large;}
+                .ft-story-header{font-size:xx-small;}
+                .ft-story-body{font-size:small;}
+                a{color:#003399;}
+                .container{font-size:x-small;}
+                h3{font-size:x-small;color:#003399;}
+                ''' 
     feeds = [ 
                (u'UK'         , u'http://www.ft.com/rss/home/uk'        ) 
               ,(u'US'         , u'http://www.ft.com/rss/home/us'        ) 
@@ -50,4 +60,4 @@ class FinancialTimes(BasicNewsRecipe):
         content_type = soup.find('meta', {'http-equiv':'Content-Type'})
         if content_type:
             content_type['content'] = 'text/html; charset=utf-8'
-        return soup
\ No newline at end of file
+        return soup

From 6e25583bc0b29c87eea491f145fd7acec5a58c52 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Dec 2009 09:22:24 -0700
Subject: [PATCH 5/7] Fix #4220 (E-Book Viewer position in book navigation is
 broken)

---
 resources/recipes/financial_times.recipe | 20 ++++++++++----------
 src/calibre/gui2/viewer/main.py          |  5 ++++-
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/resources/recipes/financial_times.recipe b/resources/recipes/financial_times.recipe
index 2864f7be89..25efc56e45 100644
--- a/resources/recipes/financial_times.recipe
+++ b/resources/recipes/financial_times.recipe
@@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class FinancialTimes(BasicNewsRecipe):
     title                 = u'Financial Times'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Darko Miletic and Sujata Raman'
     description           = 'Financial world news'
     oldest_article        = 2
     language = 'en'
@@ -21,9 +21,9 @@ class FinancialTimes(BasicNewsRecipe):
     needs_subscription    = True
     simultaneous_downloads= 1
     delay                 = 1
-    
+
     LOGIN = 'https://registration.ft.com/registration/barrier/login'
-    
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
@@ -33,7 +33,7 @@ class FinancialTimes(BasicNewsRecipe):
             br['password'] = self.password
             br.submit()
         return br
-    
+
     keep_only_tags    = [ dict(name='div', attrs={'id':'cont'}) ]
     remove_tags_after = dict(name='p', attrs={'class':'copyright'})
     remove_tags = [
@@ -48,12 +48,12 @@ class FinancialTimes(BasicNewsRecipe):
                 a{color:#003399;}
                 .container{font-size:x-small;}
                 h3{font-size:x-small;color:#003399;}
-                ''' 
-    feeds = [ 
-               (u'UK'         , u'http://www.ft.com/rss/home/uk'        ) 
-              ,(u'US'         , u'http://www.ft.com/rss/home/us'        ) 
-              ,(u'Asia'       , u'http://www.ft.com/rss/home/asia'      ) 
-              ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast') 
+                '''
+    feeds = [
+               (u'UK'         , u'http://www.ft.com/rss/home/uk'        )
+              ,(u'US'         , u'http://www.ft.com/rss/home/us'        )
+              ,(u'Asia'       , u'http://www.ft.com/rss/home/asia'      )
+              ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
             ]
 
     def preprocess_html(self, soup):
diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py
index cb2f3da7d6..7030d2623d 100644
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@@ -228,7 +228,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
         self.connect(self.action_bookmark, SIGNAL('triggered(bool)'), self.bookmark)
         self.connect(self.action_forward, SIGNAL('triggered(bool)'), self.forward)
         self.connect(self.action_preferences, SIGNAL('triggered(bool)'), lambda x: self.view.config(self))
-        self.connect(self.pos, SIGNAL('valueChanged(double)'), self.goto_page)
+        self.pos.editingFinished.connect(self.goto_page_num)
         self.connect(self.vertical_scrollbar, SIGNAL('valueChanged(int)'),
                      lambda x: self.goto_page(x/100.))
         self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), self.find)
@@ -319,6 +319,9 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
         if pos is not None:
             self.goto_page(pos)
 
+    def goto_page_num(self):  # slot for self.pos.editingFinished: go to the value now held by self.pos
+        num = self.pos.value()
+        self.goto_page(num)
 
     def forward(self, x):
         pos = self.history.forward()

From ac9a4e11e54e8ef8f0d8f45c9401ca320a26fd5f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Dec 2009 16:15:25 -0700
Subject: [PATCH 6/7] ...

---
 src/calibre/ebooks/pdf/reflow.py | 88 ++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 7afbb62b45..53be9a23de 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -6,6 +6,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import sys
+
 from lxml import etree
 
 class Font(object):
@@ -24,6 +26,7 @@ class Text(object):
         self.top, self.left, self.width, self.height = map(float, map(text.get,
             ('top', 'left', 'width', 'height')))
         self.bottom  = self.top + self.height
+        self.right = self.left + self.width
         self.font = self.font_map[text.get('font')]
         self.font_size = self.font.size
         self.color = self.font.color
@@ -43,6 +46,46 @@ class FontSizeStats(dict):
                 self.most_common_size, self.chars_at_most_common_size = sz, chars
             self[sz] = chars/total
 
+class Interval(object):  # 1-D span from left to right; truthy only when its width is positive
+
+    def __init__(self, left, right):
+        self.left, self.right = left, right
+        self.width = right - left
+
+    def intersection(self, other):  # overlap with other; the result is falsy when they do not overlap
+        left = max(self.left, other.left)
+        right = min(self.right, other.right)
+        return Interval(left, right)
+
+    def __nonzero__(self):
+        return self.width > 0
+
+    def __eq__(self, other):
+        return self.left == other.left and self.right == other.right
+
+    def __hash__(self):
+        return hash((self.left, self.right))  # hash the same pair __eq__ compares; the old form raised TypeError
+
+
+class HorizontalBox(object):  # a row of Text objects, seeded with base_text
+
+    def __init__(self, base_text):
+        self.texts = [base_text]
+        self.bottom = base_text.bottom
+        self.number_of_columns = None
+        self.column_map = {}
+
+    def append(self, t):
+        self.texts.append(t)
+
+    def sort(self):  # order texts by left edge and recompute top/bottom/left/right
+        self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left))
+        self.top, self.bottom = sys.maxint, 0  # starting values for the min/max below
+        for t in self.texts:
+            self.top = min(self.top, t.top)
+            self.bottom = max(self.bottom, t.bottom)
+        self.left = self.texts[0].left
+        self.right = self.texts[-1].right  # right edge of the text that starts furthest right
 
 class Page(object):
 
@@ -55,9 +98,14 @@ class Page(object):
         self.id = 'page%d'%self.number
 
         self.texts = []
+        self.left_margin, self.right_margin = self.width, 0
 
         for text in page.xpath('descendant::text'):
             self.texts.append(Text(text, self.font_map, self.opts, self.log))
+            self.left_margin = min(self.texts[-1].left, self.left_margin)  # use the Text just appended; the XML element has no left/right
+            self.right_margin = max(self.texts[-1].right, self.right_margin)
+
+        self.textwidth = self.right_margin - self.left_margin
 
         self.font_size_stats = {}
         for t in self.texts:
@@ -67,6 +115,43 @@ class Page(object):
 
         self.font_size_stats = FontSizeStats(self.font_size_stats)
 
+        # Columns are identified in sort_into_horizontal_boxes(), once the boxes exist
+
+    def sort_into_horizontal_boxes(self, document_font_size_stats):  # document_font_size_stats is currently unused
+        self.horizontal_boxes = []
+
+        def find_closest_match(text):
+            'Return horizontal box whose bottom is closest to text or None'
+            best, ans = 3.1, None  # only boxes whose bottom is less than 3.1 away qualify
+            for hb in self.horizontal_boxes:
+                diff = abs(text.bottom - hb.bottom)
+                if diff < best:
+                    best, ans = diff, hb  # remember the closest box seen so far
+            return ans
+
+        for t in self.texts:
+            hb = find_closest_match(t)
+            if hb is None:
+                self.horizontal_boxes.append(HorizontalBox(t))
+            else:
+                hb.append(t)
+
+        for hb in self.horizontal_boxes:
+            hb.sort()
+
+        self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom))
+        self.identify_columns()  # moved here from __init__, where self.horizontal_boxes did not exist yet
+
+    def identify_columns(self):  # unfinished: walks the horizontal boxes but does nothing with them yet
+        # NOTE(review): neighborhood() is never called here and would raise IndexError for the last box
+        def neighborhood(i):
+            if i == 0:
+                return self.horizontal_boxes[1:3]
+            return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1])
+
+        for i, hbox in enumerate(self.horizontal_boxes):  # needs self.horizontal_boxes, set by sort_into_horizontal_boxes()
+            pass
+
 
 
 class PDFDocument(object):
@@ -93,6 +178,9 @@ class PDFDocument(object):
 
         self.collect_font_statistics()
 
+        for page in self.pages:
+            page.sort_into_horizontal_boxes(self.font_size_stats)
+
     def collect_font_statistics(self):
         self.font_size_stats = {}
         for p in self.pages:

From d928b3479881b876d549dbf6f54545f2bc18986a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Dec 2009 16:16:43 -0700
Subject: [PATCH 7/7] PML/PDB Output: Use \CX tags to generate chapter index

---
 src/calibre/ebooks/pdb/ereader/writer.py | 52 +++++++++---------------
 src/calibre/ebooks/pml/pmlml.py          | 29 ++++++++++---
 2 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py
index 263f6964bf..a379899af5 100644
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -42,8 +42,8 @@ class Writer(FormatWriter):
         pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
 
         text, text_sizes = self._text(pml)
-        chapter_index = self._chapter_index(pml)
-        link_index = self._link_index(pml)
+        chapter_index = self._index_item(r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', pml)
+        link_index = self._index_item(r'(?s)\\Q="(?P<text>.+?)"', pml)
         images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
         metadata = [self._metadata(metadata)]
         hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
@@ -101,38 +101,24 @@ class Writer(FormatWriter):
 
         return pml_pages, text_sizes
 
-    def _index_item(self, mo):
-        index = ''
-        if 'text' in mo.groupdict().keys():
-            index += struct.pack('>L', mo.start())
-            text = mo.group('text')
-            # Strip all PML tags from text
-            text = re.sub(r'\\U[0-9a-z]{4}', '', text)
-            text = re.sub(r'\\a\d{3}', '', text)
-            text = re.sub(r'\\.', '', text)
-            # Add appropriate spacing to denote the various levels of headings
-            if 'val' in mo.groupdict().keys():
-                text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
-            index += text
-            index += '\x00'
-        return index
-
-    def _chapter_index(self, pml):
-        chapter_marks = [
-            r'(?s)\\x(?P<text>.+?)\\x',
-            r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
-            r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
-        ]
+    def _index_item(self, regex, pml):
         index = []
-        for chapter_mark in chapter_marks:
-            for mo in re.finditer(chapter_mark, pml):
-                index.append(self._index_item(mo))
-        return index
-
-    def _link_index(self, pml):
-        index = []
-        for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
-            index.append(self._index_item(mo))
+        for mo in re.finditer(regex, pml):
+            item = ''
+            if 'text' in mo.groupdict().keys():
+                item += struct.pack('>L', mo.start())
+                text = mo.group('text')
+                # Strip all PML tags from text
+                text = re.sub(r'\\U[0-9a-z]{4}', '', text)
+                text = re.sub(r'\\a\d{3}', '', text)
+                text = re.sub(r'\\.', '', text)
+                # Add appropriate spacing to denote the various levels of headings
+                if 'val' in mo.groupdict().keys():
+                    text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
+                item += text
+                item += '\x00'
+            if item:
+                index.append(item)
         return index
 
     def _images(self, manifest, image_hrefs):
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index b23cd40813..ccce95fce6 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -79,6 +79,16 @@ class PMLMLizer(object):
         self.log.info('Converting XHTML to PML markup...')
         self.oeb_book = oeb_book
         self.opts = opts
+
+        # This is used for adding \CX tags chapter markers. This is separate
+        # from the optional inline toc.
+        self.toc = {}
+        for item in oeb_book.toc:
+            page, mid, id = item.href.partition('#')
+            if not self.toc.get(page, None):
+                self.toc[page] = {}
+            self.toc[page][id] = item.title
+
         return self.pmlmlize_spine()
 
     def pmlmlize_spine(self):
@@ -107,7 +117,11 @@ class PMLMLizer(object):
         return output
 
     def get_toc(self):
-        toc = [u'']
+        '''
+        Generation of inline TOC
+        '''
+
+        toc = []
         if self.opts.inline_toc:
             self.log.debug('Generating table of contents...')
             toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:'))
@@ -177,14 +191,14 @@ class PMLMLizer(object):
     def dump_text(self, elem, stylizer, page, tag_stack=[]):
         if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
-            return [u'']
+            return []
 
-        text = [u'']
+        text = []
         style = stylizer.style(elem)
 
         if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
-            return [u'']
+            return []
 
         tag = barename(elem.tag)
         tag_count = 0
@@ -213,6 +227,12 @@ class PMLMLizer(object):
             else:
                 w += '="50%"'
             text.append(w)
+        toc_id = elem.attrib.get('id', None)
+        if toc_id:
+            if self.toc.get(page.href, None):
+                toc_title = self.toc[page.href].get(toc_id, None)
+                if toc_title:
+                    text.append('\\C1="%s"' % toc_title)
 
         # Process style information that needs holds a single tag
         # Commented out because every page in an OEB book starts with this style
@@ -287,4 +307,3 @@ class PMLMLizer(object):
             if tag != 'block':
                 text.append('\\%s' % tag)
         return text
-