diff --git a/resources/recipes/financial_times.recipe b/resources/recipes/financial_times.recipe
index 9c42c1e8f7..25efc56e45 100644
--- a/resources/recipes/financial_times.recipe
+++ b/resources/recipes/financial_times.recipe
@@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class FinancialTimes(BasicNewsRecipe):
     title                 = u'Financial Times'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Darko Miletic and Sujata Raman'
     description           = 'Financial world news'
     oldest_article        = 2
     language = 'en'
@@ -21,8 +21,9 @@ class FinancialTimes(BasicNewsRecipe):
     needs_subscription    = True
     simultaneous_downloads= 1
     delay                 = 1
+
     LOGIN = 'https://registration.ft.com/registration/barrier/login'
-    
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
@@ -32,22 +33,31 @@ class FinancialTimes(BasicNewsRecipe):
             br['password'] = self.password
             br.submit()
         return br
-    
+
     keep_only_tags    = [ dict(name='div', attrs={'id':'cont'}) ]
     remove_tags_after = dict(name='p', attrs={'class':'copyright'})
     remove_tags = [
                     dict(name='div', attrs={'id':'floating-con'})
                   ]
-    
-    feeds = [
-              (u'UK'         , u'http://www.ft.com/rss/home/uk'        )
-             ,(u'US'         , u'http://www.ft.com/rss/home/us'        )
-             ,(u'Asia'       , u'http://www.ft.com/rss/home/asia'      )
-             ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
+
+    extra_css = '''
+                body{font-family:Arial,Helvetica,sans-serif;}
+                h2{font-size:large;}
+                .ft-story-header{font-size:xx-small;}
+                .ft-story-body{font-size:small;}
+                a{color:#003399;}
+                .container{font-size:x-small;}
+                h3{font-size:x-small;color:#003399;}
+                '''
+    feeds = [
+              (u'UK'         , u'http://www.ft.com/rss/home/uk'        )
+             ,(u'US'         , u'http://www.ft.com/rss/home/us'        )
+             ,(u'Asia'       , u'http://www.ft.com/rss/home/asia'      )
+             ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
             ]
 
     def preprocess_html(self, soup):
         content_type = soup.find('meta', {'http-equiv':'Content-Type'})
         if content_type:
             content_type['content'] = 'text/html; charset=utf-8'
-        return soup
\ No newline at end of file
+        return soup
diff --git a/resources/recipes/newsweek.recipe b/resources/recipes/newsweek.recipe
index ff408ca9a5..f6da941361 100644
--- a/resources/recipes/newsweek.recipe
+++ b/resources/recipes/newsweek.recipe
@@ -33,19 +33,21 @@ class Newsweek(BasicNewsRecipe):
     language = 'en'
 
     remove_tags = [
-            {'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
+            {'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
                 'inline-social-links-wrapper', 'email-article','ToolBox',
+                'inline-promo-link', 'sponsorship', 'inlineComponentRight',
                 'comments-and-social-links-wrapper', 'EmailArticleBlock']},
             {'id' : ['footer', 'ticker-data', 'topTenVertical',
-                'digg-top-five', 'mesothorax', 'nw-comments',
+                'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
                 'ToolBox', 'EmailMain']},
             {'class': re.compile('related-cloud')},
             dict(name='li', attrs={'id':['slug_bigbox']})
             ]
 
 
 
-    keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent','photoBox']}, ]
+    keep_only_tags = [{'class':['article HorizontalHeader',
+        'articlecontent','photoBox', 'article columnist first']}, ]
     recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
 
diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py
index 77deb6efa5..e1d8aaa0c7 100644
--- a/src/calibre/devices/cybookg3/driver.py
+++ b/src/calibre/devices/cybookg3/driver.py
@@ -83,9 +83,7 @@ class CYBOOKG3(USBMS):
     def can_handle(cls, device_info, debug=False):
         USBMS.can_handle(device_info, debug)
         if islinux:
-            if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3':
-                return True
-            return False
+            return device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3'
         return True
 
 
@@ -94,7 +92,7 @@ class CYBOOK_OPUS(CYBOOKG3):
     name           = 'Cybook Opus Device Interface'
     gui_name       = 'Cybook Opus'
     description    = _('Communicate with the Cybook Opus eBook reader.')
-    author         = _('John Schember')
+    author         = 'John Schember'
     supported_platforms = ['windows', 'osx', 'linux']
 
     FORMATS = ['epub', 'pdf', 'txt']
@@ -118,7 +116,5 @@ class CYBOOK_OPUS(CYBOOKG3):
     def can_handle(cls, device_info, debug=False):
         USBMS.can_handle(device_info, debug)
         if islinux:
-            if device_info[3] == 'Bookeen':
-                return True
-            return False
+            return device_info[3] == 'Bookeen'
         return True
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 8cef0f327d..53be9a23de 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -6,6 +6,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
+import sys
+
 from lxml import etree
 
 class Font(object):
@@ -23,6 +25,8 @@ class Text(object):
         self.font_map = font_map
         self.top, self.left, self.width, self.height = map(float, map(text.get,
             ('top', 'left', 'width', 'height')))
+        self.bottom = self.top + self.height
+        self.right = self.left + self.width
         self.font = self.font_map[text.get('font')]
         self.font_size = self.font.size
         self.color = self.font.color
@@ -31,6 +35,68 @@ class Text(object):
         self.text_as_string = etree.tostring(text, method='text',
                 encoding=unicode)
 
+class FontSizeStats(dict):
+    '''
+    Maps each font size to its share of the total weight in stats and
+    records the size with the largest weight as most_common_size.
+    '''
+
+    def __init__(self, stats):
+        total = float(sum(stats.values()))
+        self.most_common_size, self.chars_at_most_common_size = -1, 0
+
+        for sz, chars in stats.items():
+            if chars >= self.chars_at_most_common_size:
+                self.most_common_size, self.chars_at_most_common_size = sz, chars
+            # Avoid dividing by zero when every weight in stats is zero
+            self[sz] = chars/total if total else 0.0
+
+class Interval(object):
+    'A span from left to right that is false unless its width is positive'
+
+    def __init__(self, left, right):
+        self.left, self.right = left, right
+        self.width = right - left
+
+    def intersection(self, other):
+        'Return the overlap with other; its width is <= 0 if there is none'
+        left = max(self.left, other.left)
+        right = min(self.right, other.right)
+        return Interval(left, right)
+
+    def __nonzero__(self):
+        return self.width > 0
+
+    def __eq__(self, other):
+        return self.left == other.left and self.right == other.right
+
+    def __hash__(self):
+        # Hash exactly the pair of values that __eq__ compares
+        return hash((self.left, self.right))
+
+
+class HorizontalBox(object):
+    'A group of Text objects, seeded with base_text'
+
+    def __init__(self, base_text):
+        self.texts = [base_text]
+        self.bottom = base_text.bottom
+        self.number_of_columns = None
+        self.column_map = {}
+
+    def append(self, t):
+        self.texts.append(t)
+
+    def sort(self):
+        'Order the texts by left edge and recompute the edges of the box'
+        self.texts.sort(cmp=lambda x,y: cmp(x.left, y.left))
+        self.top, self.bottom = sys.maxint, 0
+        for t in self.texts:
+            self.top = min(self.top, t.top)
+            self.bottom = max(self.bottom, t.bottom)
+        self.left = self.texts[0].left
+        self.right = self.texts[-1].right
+
 class Page(object):
 
     def __init__(self, page, font_map, opts, log):
@@ -42,9 +108,67 @@ class Page(object):
         self.id = 'page%d'%self.number
 
         self.texts = []
+        self.left_margin, self.right_margin = self.width, 0
 
         for text in page.xpath('descendant::text'):
             self.texts.append(Text(text, self.font_map, self.opts, self.log))
+            # text is the XML element; the Text built from it has the geometry
+            self.left_margin = min(self.texts[-1].left, self.left_margin)
+            self.right_margin = max(self.texts[-1].right, self.right_margin)
+
+        self.textwidth = self.right_margin - self.left_margin
+
+        self.font_size_stats = {}
+        for t in self.texts:
+            if t.font_size not in self.font_size_stats:
+                self.font_size_stats[t.font_size] = 0
+            self.font_size_stats[t.font_size] += len(t.text_as_string)
+
+        self.font_size_stats = FontSizeStats(self.font_size_stats)
+
+    def sort_into_horizontal_boxes(self, document_font_size_stats):
+        '''
+        Group self.texts into HorizontalBox objects by bottom edge, sort
+        each box by left edge and the boxes by bottom edge, then call
+        identify_columns().
+        '''
+        self.horizontal_boxes = []
+
+        def find_closest_match(text):
+            'Return horizontal box whose bottom is closest to text or None'
+            min_diff, ans = 3.1, None
+            for hb in self.horizontal_boxes:
+                diff = abs(text.bottom - hb.bottom)
+                if diff < min_diff:
+                    min_diff, ans = diff, hb
+            return ans
+
+        for t in self.texts:
+            hb = find_closest_match(t)
+            if hb is None:
+                self.horizontal_boxes.append(HorizontalBox(t))
+            else:
+                hb.append(t)
+
+
+        for hb in self.horizontal_boxes:
+            hb.sort()
+
+        self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom))
+
+        # Needs self.horizontal_boxes, so it cannot be called from __init__
+        self.identify_columns()
+
+    def identify_columns(self):
+
+        def neighborhood(i):
+            if i == 0:
+                return self.horizontal_boxes[1:3]
+            return (self.horizontal_boxes[i-1], self.horizontal_boxes[i+1])
+
+        for i, hbox in enumerate(self.horizontal_boxes):
+            pass
+
 
 
 class PDFDocument(object):
@@ -69,6 +193,22 @@ class PDFDocument(object):
             self.page_map[page.id] = page
             self.pages.append(page)
 
+        self.collect_font_statistics()
+
+        for page in self.pages:
+            page.sort_into_horizontal_boxes(self.font_size_stats)
+
+    def collect_font_statistics(self):
+        'Combine the per page font size statistics into self.font_size_stats'
+        self.font_size_stats = {}
+        for p in self.pages:
+            # p.font_size_stats holds per-page fractions, not character counts
+            for sz, chars in p.font_size_stats.items():
+                if sz not in self.font_size_stats:
+                    self.font_size_stats[sz] = 0
+                self.font_size_stats[sz] += chars
+
+        self.font_size_stats = FontSizeStats(self.font_size_stats)
 
 
 
diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py
index cb2f3da7d6..7030d2623d 100644
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@@ -228,7 +228,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
         self.connect(self.action_bookmark, SIGNAL('triggered(bool)'), self.bookmark)
         self.connect(self.action_forward, SIGNAL('triggered(bool)'), self.forward)
         self.connect(self.action_preferences, SIGNAL('triggered(bool)'), lambda x: self.view.config(self))
-        self.connect(self.pos, SIGNAL('valueChanged(double)'), self.goto_page)
+        self.pos.editingFinished.connect(self.goto_page_num)
         self.connect(self.vertical_scrollbar, SIGNAL('valueChanged(int)'),
                      lambda x: self.goto_page(x/100.))
         self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), self.find)
@@ -319,6 +319,10 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
         if pos is not None:
             self.goto_page(pos)
 
+    def goto_page_num(self):
+        'Go to the position currently held by the self.pos widget'
+        num = self.pos.value()
+        self.goto_page(num)
 
     def forward(self, x):
         pos = self.history.forward()