Sync to Trunk.

This commit is contained in:
John Schember 2009-12-15 19:31:55 -05:00
commit 83d4deac87
5 changed files with 153 additions and 21 deletions

View File

@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class FinancialTimes(BasicNewsRecipe):
title = u'Financial Times'
__author__ = 'Darko Miletic'
__author__ = 'Darko Miletic and Sujata Raman'
description = 'Financial world news'
oldest_article = 2
language = 'en'
@ -21,8 +21,9 @@ class FinancialTimes(BasicNewsRecipe):
needs_subscription = True
simultaneous_downloads= 1
delay = 1
LOGIN = 'https://registration.ft.com/registration/barrier/login'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
@ -32,22 +33,31 @@ class FinancialTimes(BasicNewsRecipe):
br['password'] = self.password
br.submit()
return br
keep_only_tags = [ dict(name='div', attrs={'id':'cont'}) ]
remove_tags_after = dict(name='p', attrs={'class':'copyright'})
remove_tags = [
dict(name='div', attrs={'id':'floating-con'})
]
feeds = [
(u'UK' , u'http://www.ft.com/rss/home/uk' )
,(u'US' , u'http://www.ft.com/rss/home/us' )
,(u'Asia' , u'http://www.ft.com/rss/home/asia' )
,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
# CSS rules for this recipe (the ``extra_css`` recipe attribute).
# Every rule must open its declaration block with '{'; a rule opened with
# '(' is malformed CSS and is not applied as intended.
extra_css = '''
body{font-family:Arial,Helvetica,sans-serif;}
h2{font-size:large;}
.ft-story-header{font-size:xx-small;}
.ft-story-body{font-size:small;}
a{color:#003399;}
.container{font-size:x-small;}
h3{font-size:x-small;color:#003399;}
'''
feeds = [
(u'UK' , u'http://www.ft.com/rss/home/uk' )
,(u'US' , u'http://www.ft.com/rss/home/us' )
,(u'Asia' , u'http://www.ft.com/rss/home/asia' )
,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
]
def preprocess_html(self, soup):
content_type = soup.find('meta', {'http-equiv':'Content-Type'})
if content_type:
content_type['content'] = 'text/html; charset=utf-8'
return soup
return soup

View File

@ -33,19 +33,21 @@ class Newsweek(BasicNewsRecipe):
language = 'en'
remove_tags = [
{'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
{'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
'inline-social-links-wrapper', 'email-article','ToolBox',
'inline-promo-link', 'sponsorship',
'inlineComponentRight',
'comments-and-social-links-wrapper', 'EmailArticleBlock']},
{'id' : ['footer', 'ticker-data', 'topTenVertical',
'digg-top-five', 'mesothorax', 'nw-comments',
'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
'ToolBox', 'EmailMain']},
{'class': re.compile('related-cloud')},
dict(name='li', attrs={'id':['slug_bigbox']})
]
keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent','photoBox']}, ]
keep_only_tags = [{'class':['article HorizontalHeader',
'articlecontent','photoBox', 'article columnist first']}, ]
recursions = 1
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']

View File

@ -83,9 +83,7 @@ class CYBOOKG3(USBMS):
def can_handle(cls, device_info, debug=False):
USBMS.can_handle(device_info, debug)
if islinux:
if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3':
return True
return False
return device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3'
return True
@ -94,7 +92,7 @@ class CYBOOK_OPUS(CYBOOKG3):
name = 'Cybook Opus Device Interface'
gui_name = 'Cybook Opus'
description = _('Communicate with the Cybook Opus eBook reader.')
author = _('John Schember')
author = 'John Schember'
supported_platforms = ['windows', 'osx', 'linux']
FORMATS = ['epub', 'pdf', 'txt']
@ -118,7 +116,5 @@ class CYBOOK_OPUS(CYBOOKG3):
def can_handle(cls, device_info, debug=False):
USBMS.can_handle(device_info, debug)
if islinux:
if device_info[3] == 'Bookeen':
return True
return False
return device_info[3] == 'Bookeen'
return True

View File

@ -6,6 +6,8 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys
from lxml import etree
class Font(object):
@ -23,6 +25,8 @@ class Text(object):
self.font_map = font_map
self.top, self.left, self.width, self.height = map(float, map(text.get,
('top', 'left', 'width', 'height')))
self.bottom = self.top + self.height
self.right = self.left + self.width
self.font = self.font_map[text.get('font')]
self.font_size = self.font.size
self.color = self.font.color
@ -31,6 +35,58 @@ class Text(object):
self.text_as_string = etree.tostring(text, method='text',
encoding=unicode)
class FontSizeStats(dict):
    """Map each font size to the fraction of characters set in that size.

    Besides the mapping itself, two attributes are recorded:

    ``most_common_size``
        The font size with the highest character count (``-1`` when the
        input is empty).
    ``chars_at_most_common_size``
        The raw character count at that size.
    """

    def __init__(self, stats):
        """Build the statistics.

        :param stats: mapping of font size -> number of characters at that
                      size.
        """
        dict.__init__(self)
        total = float(sum(stats.values()))
        self.most_common_size, self.chars_at_most_common_size = -1, 0
        for sz, chars in stats.items():
            # ``>=`` lets a later entry win a tie, and lets a size with zero
            # characters replace the initial -1 placeholder.
            if chars >= self.chars_at_most_common_size:
                self.most_common_size = sz
                self.chars_at_most_common_size = chars
            # When every count is zero there is nothing to normalise
            # against; report 0.0 instead of dividing by zero.
            self[sz] = chars / total if total else 0.0
class Interval(object):
    """A one-dimensional span running from ``left`` to ``right``.

    An interval is truthy only when its width is positive, so the result of
    :meth:`intersection` can be tested directly to see whether two intervals
    overlap.
    """

    def __init__(self, left, right):
        """Store both end points and the derived ``width``."""
        self.left, self.right = left, right
        self.width = right - left

    def intersection(self, other):
        """Return the overlap of this interval and ``other``.

        The result has a non-positive width (and is therefore falsy) when
        the two intervals do not overlap.
        """
        left = max(self.left, other.left)
        right = min(self.right, other.right)
        return Interval(left, right)

    def __nonzero__(self):
        return self.width > 0

    # Python 3 spells the truth-value hook ``__bool__``.
    __bool__ = __nonzero__

    def __eq__(self, other):
        return self.left == other.left and self.right == other.right

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __hash__(self):
        # Hash exactly the pair that __eq__ compares so that equal intervals
        # always share a hash.
        return hash((self.left, self.right))
class HorizontalBox(object):
    """A group of text fragments collected around one base fragment.

    ``bottom`` starts out as the base fragment's bottom. ``top``, ``left``
    and ``right`` only exist after :meth:`sort` has been called.
    """

    def __init__(self, base_text):
        """Start a box containing only ``base_text``.

        :param base_text: object exposing ``left``, ``right``, ``top`` and
                          ``bottom`` attributes.
        """
        self.texts = [base_text]
        self.bottom = base_text.bottom
        self.number_of_columns = None
        self.column_map = {}

    def append(self, t):
        """Add fragment ``t``; the extents are not updated until sort()."""
        self.texts.append(t)

    def sort(self):
        """Order the fragments left to right and recompute the extents."""
        # A key function gives the same stable ordering as comparing the
        # ``left`` attributes pairwise.
        self.texts.sort(key=lambda t: t.left)
        # The box always holds at least its base fragment, so the extents
        # can be taken straight from the fragments without sentinel values.
        self.top = min(t.top for t in self.texts)
        self.bottom = max(t.bottom for t in self.texts)
        self.left = self.texts[0].left
        # Right edge of the fragment that starts furthest to the right.
        self.right = self.texts[-1].right
class Page(object):
def __init__(self, page, font_map, opts, log):
@ -42,9 +98,60 @@ class Page(object):
self.id = 'page%d'%self.number
self.texts = []
self.left_margin, self.right_margin = self.width, 0
for text in page.xpath('descendant::text'):
self.texts.append(Text(text, self.font_map, self.opts, self.log))
self.left_margin = min(text.left, self.left_margin)
self.right_margin = max(text.right, self.right_margin)
self.textwidth = self.right_margin - self.left_margin
self.font_size_stats = {}
for t in self.texts:
if t.font_size not in self.font_size_stats:
self.font_size_stats[t.font_size] = 0
self.font_size_stats[t.font_size] += len(t.text_as_string)
self.font_size_stats = FontSizeStats(self.font_size_stats)
self.identify_columns()
def sort_into_horizontal_boxes(self, document_font_size_stats):
    """Group this page's text fragments into horizontal boxes.

    Each fragment in ``self.texts`` joins the existing box whose ``bottom``
    is nearest to its own, provided the two differ by less than 3.1;
    otherwise it starts a new box. Every box is then sorted internally and
    the boxes themselves are ordered by ``bottom``. The result is stored in
    ``self.horizontal_boxes``.

    :param document_font_size_stats: document-wide font statistics; accepted
                                     but not used by this method.
    """
    self.horizontal_boxes = []

    def find_closest_match(text):
        'Return horizontal box whose bottom is closest to text or None'
        best_diff, ans = 3.1, None
        for hb in self.horizontal_boxes:
            diff = abs(text.bottom - hb.bottom)
            if diff < best_diff:
                # Tighten the threshold so only a strictly closer box can
                # replace the current candidate.
                best_diff, ans = diff, hb
        return ans

    for t in self.texts:
        hb = find_closest_match(t)
        if hb is None:
            self.horizontal_boxes.append(HorizontalBox(t))
        else:
            hb.append(t)

    for hb in self.horizontal_boxes:
        hb.sort()

    self.horizontal_boxes.sort(key=lambda box: box.bottom)
def identify_columns(self):
    """Walk the page's horizontal boxes (column detection placeholder).

    The loop body does nothing yet, so the method currently has no effect
    and returns ``None``.
    """
    # __init__ calls this before sort_into_horizontal_boxes() has created
    # self.horizontal_boxes, so treat a missing attribute as "no boxes yet".
    boxes = getattr(self, 'horizontal_boxes', ())

    def neighborhood(i):
        'Return the boxes adjacent to box i (the next two for the first box)'
        if i == 0:
            return tuple(boxes[1:3])
        # Slicing clamps at the end of the sequence, so the last box simply
        # has no successor instead of raising IndexError.
        return (boxes[i-1],) + tuple(boxes[i+1:i+2])

    for i, hbox in enumerate(boxes):
        pass
class PDFDocument(object):
@ -69,6 +176,20 @@ class PDFDocument(object):
self.page_map[page.id] = page
self.pages.append(page)
self.collect_font_statistics()
for page in self.pages:
page.sort_into_horizontal_boxes(self.font_size_stats)
def collect_font_statistics(self):
    """Combine the per-page font size statistics into document-wide ones.

    Sums the value recorded for each font size across ``self.pages`` and
    stores the result, wrapped in ``FontSizeStats``, on
    ``self.font_size_stats``.
    """
    totals = {}
    for p in self.pages:
        # Iterating a mapping directly yields only its keys; .items() is
        # required to get (size, value) pairs.
        for sz, chars in p.font_size_stats.items():
            totals[sz] = totals.get(sz, 0) + chars
    # NOTE(review): the per-page values appear to be the normalised
    # fractions produced by FontSizeStats rather than raw character counts,
    # which would weight every page equally - confirm this is intended.
    self.font_size_stats = FontSizeStats(totals)

View File

@ -228,7 +228,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
self.connect(self.action_bookmark, SIGNAL('triggered(bool)'), self.bookmark)
self.connect(self.action_forward, SIGNAL('triggered(bool)'), self.forward)
self.connect(self.action_preferences, SIGNAL('triggered(bool)'), lambda x: self.view.config(self))
self.connect(self.pos, SIGNAL('valueChanged(double)'), self.goto_page)
self.pos.editingFinished.connect(self.goto_page_num)
self.connect(self.vertical_scrollbar, SIGNAL('valueChanged(int)'),
lambda x: self.goto_page(x/100.))
self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), self.find)
@ -319,6 +319,9 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
if pos is not None:
self.goto_page(pos)
def goto_page_num(self):
    """Go to the page given by the current value of ``self.pos``."""
    self.goto_page(self.pos.value())
def forward(self, x):
pos = self.history.forward()