mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to Trunk.
This commit is contained in:
commit
83d4deac87
@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class FinancialTimes(BasicNewsRecipe):
|
||||
title = u'Financial Times'
|
||||
__author__ = 'Darko Miletic'
|
||||
__author__ = 'Darko Miletic and Sujata Raman'
|
||||
description = 'Financial world news'
|
||||
oldest_article = 2
|
||||
language = 'en'
|
||||
@ -21,8 +21,9 @@ class FinancialTimes(BasicNewsRecipe):
|
||||
needs_subscription = True
|
||||
simultaneous_downloads= 1
|
||||
delay = 1
|
||||
|
||||
LOGIN = 'https://registration.ft.com/registration/barrier/login'
|
||||
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
@ -32,22 +33,31 @@ class FinancialTimes(BasicNewsRecipe):
|
||||
br['password'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
|
||||
keep_only_tags = [ dict(name='div', attrs={'id':'cont'}) ]
|
||||
remove_tags_after = dict(name='p', attrs={'class':'copyright'})
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':'floating-con'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'UK' , u'http://www.ft.com/rss/home/uk' )
|
||||
,(u'US' , u'http://www.ft.com/rss/home/us' )
|
||||
,(u'Asia' , u'http://www.ft.com/rss/home/asia' )
|
||||
,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
|
||||
|
||||
# Extra CSS for this recipe. The h2, .ft-story-header and .ft-story-body
# rules previously opened with "(" instead of "{", which is not valid CSS,
# so those three rules are corrected here.
extra_css = '''
body{font-family:Arial,Helvetica,sans-serif;}
h2{font-size:large;}
.ft-story-header{font-size:xx-small;}
.ft-story-body{font-size:small;}
a{color:#003399;}
.container{font-size:x-small;}
h3{font-size:x-small;color:#003399;}
'''
|
||||
feeds = [
|
||||
(u'UK' , u'http://www.ft.com/rss/home/uk' )
|
||||
,(u'US' , u'http://www.ft.com/rss/home/us' )
|
||||
,(u'Asia' , u'http://www.ft.com/rss/home/asia' )
|
||||
,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
    """Rewrite the page's Content-Type meta tag, when present, to UTF-8.

    Looks up the ``<meta http-equiv="Content-Type">`` element in *soup*
    and overwrites its ``content`` attribute. The same *soup* object is
    always returned, whether or not such a tag was found.
    """
    meta = soup.find('meta', {'http-equiv':'Content-Type'})
    if not meta:
        return soup
    meta['content'] = 'text/html; charset=utf-8'
    return soup
|
||||
|
@ -33,19 +33,21 @@ class Newsweek(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
|
||||
remove_tags = [
|
||||
{'class':['fwArticle noHr','fwArticle','subinfo','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
|
||||
{'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
|
||||
'inline-social-links-wrapper', 'email-article','ToolBox',
|
||||
'inline-promo-link', 'sponsorship',
|
||||
'inlineComponentRight',
|
||||
'comments-and-social-links-wrapper', 'EmailArticleBlock']},
|
||||
{'id' : ['footer', 'ticker-data', 'topTenVertical',
|
||||
'digg-top-five', 'mesothorax', 'nw-comments',
|
||||
'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
|
||||
'ToolBox', 'EmailMain']},
|
||||
{'class': re.compile('related-cloud')},
|
||||
dict(name='li', attrs={'id':['slug_bigbox']})
|
||||
]
|
||||
|
||||
|
||||
keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent','photoBox']}, ]
|
||||
keep_only_tags = [{'class':['article HorizontalHeader',
|
||||
'articlecontent','photoBox', 'article columnist first']}, ]
|
||||
recursions = 1
|
||||
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
|
||||
|
||||
|
@ -83,9 +83,7 @@ class CYBOOKG3(USBMS):
|
||||
def can_handle(cls, device_info, debug=False):
|
||||
USBMS.can_handle(device_info, debug)
|
||||
if islinux:
|
||||
if device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3':
|
||||
return True
|
||||
return False
|
||||
return device_info[3] == 'Bookeen' and device_info[4] == 'Cybook Gen3'
|
||||
return True
|
||||
|
||||
|
||||
@ -94,7 +92,7 @@ class CYBOOK_OPUS(CYBOOKG3):
|
||||
name = 'Cybook Opus Device Interface'
|
||||
gui_name = 'Cybook Opus'
|
||||
description = _('Communicate with the Cybook Opus eBook reader.')
|
||||
author = _('John Schember')
|
||||
author = 'John Schember'
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
|
||||
FORMATS = ['epub', 'pdf', 'txt']
|
||||
@ -118,7 +116,5 @@ class CYBOOK_OPUS(CYBOOKG3):
|
||||
def can_handle(cls, device_info, debug=False):
|
||||
USBMS.can_handle(device_info, debug)
|
||||
if islinux:
|
||||
if device_info[3] == 'Bookeen':
|
||||
return True
|
||||
return False
|
||||
return device_info[3] == 'Bookeen'
|
||||
return True
|
||||
|
@ -6,6 +6,8 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys
|
||||
|
||||
from lxml import etree
|
||||
|
||||
class Font(object):
|
||||
@ -23,6 +25,8 @@ class Text(object):
|
||||
self.font_map = font_map
|
||||
self.top, self.left, self.width, self.height = map(float, map(text.get,
|
||||
('top', 'left', 'width', 'height')))
|
||||
self.bottom = self.top + self.height
|
||||
self.right = self.left + self.width
|
||||
self.font = self.font_map[text.get('font')]
|
||||
self.font_size = self.font.size
|
||||
self.color = self.font.color
|
||||
@ -31,6 +35,58 @@ class Text(object):
|
||||
self.text_as_string = etree.tostring(text, method='text',
|
||||
encoding=unicode)
|
||||
|
||||
class FontSizeStats(dict):
    """Mapping of font size -> fraction of all characters using that size.

    Besides the mapping itself, two attributes are set:

    ``most_common_size``
        The size with the highest character count (-1 if *stats* is empty).
    ``chars_at_most_common_size``
        The raw character count recorded for that size.
    """

    def __init__(self, stats):
        """Build the statistics from *stats*, a mapping of size -> count."""
        total = float(sum(stats.values()))
        self.most_common_size, self.chars_at_most_common_size = -1, 0

        for sz, chars in stats.items():
            # ">=" means that on a tie the size seen later in iteration wins.
            if chars >= self.chars_at_most_common_size:
                self.most_common_size, self.chars_at_most_common_size = sz, chars
            # When every count is zero the total is zero as well; record a
            # zero share instead of raising ZeroDivisionError.
            self[sz] = chars/total if total else 0.0
|
||||
|
||||
class Interval(object):
    """A one-dimensional span described by its ``left`` and ``right`` ends.

    ``width`` is ``right - left``. An interval is truthy only when its
    width is positive.
    """

    def __init__(self, left, right):
        self.left, self.right = left, right
        self.width = right - left

    def intersection(self, other):
        """Return the overlap of this interval with *other*.

        When the two do not overlap, the result has a non-positive width
        and therefore evaluates as false.
        """
        left = max(self.left, other.left)
        right = min(self.right, other.right)
        return Interval(left, right)

    def __nonzero__(self):
        return self.width > 0

    # Python 3 looks up __bool__ rather than __nonzero__.
    __bool__ = __nonzero__

    def __eq__(self, other):
        if not isinstance(other, Interval):
            return NotImplemented
        return self.left == other.left and self.right == other.right

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Hash the same pair that __eq__ compares. The previous
        # "'(%f,%f)'%self.left, self.right" expression raised TypeError
        # because the format string received only one value.
        return hash((self.left, self.right))
|
||||
|
||||
|
||||
class HorizontalBox(object):
    """A group of text objects collected around one base text.

    Each text is expected to expose ``left``, ``right``, ``top`` and
    ``bottom`` attributes. ``top``, ``left`` and ``right`` are only
    available after :meth:`sort` has been called.
    """

    def __init__(self, base_text):
        self.texts = [base_text]
        self.bottom = base_text.bottom
        self.number_of_columns = None
        self.column_map = {}

    def append(self, t):
        """Add text *t* to the box; call :meth:`sort` to refresh the bounds."""
        self.texts.append(t)

    def sort(self):
        """Order the texts by their left edge and recompute the box bounds."""
        # key= yields the same ordering as the old cmp= comparison and is
        # accepted by both Python 2 and Python 3.
        self.texts.sort(key=lambda t: t.left)
        self.top = min(t.top for t in self.texts)
        # The lower bound of 0 is kept from the original implementation.
        self.bottom = max(0, max(t.bottom for t in self.texts))
        self.left = self.texts[0].left
        # NOTE(review): this is the right edge of the text that starts
        # furthest right, which is not necessarily the largest right edge
        # in the box - confirm that this is intended.
        self.right = self.texts[-1].right
|
||||
|
||||
class Page(object):
|
||||
|
||||
def __init__(self, page, font_map, opts, log):
|
||||
@ -42,9 +98,60 @@ class Page(object):
|
||||
self.id = 'page%d'%self.number
|
||||
|
||||
self.texts = []
|
||||
self.left_margin, self.right_margin = self.width, 0
|
||||
|
||||
for text in page.xpath('descendant::text'):
|
||||
self.texts.append(Text(text, self.font_map, self.opts, self.log))
|
||||
self.left_margin = min(text.left, self.left_margin)
|
||||
self.right_margin = max(text.right, self.right_margin)
|
||||
|
||||
self.textwidth = self.right_margin - self.left_margin
|
||||
|
||||
self.font_size_stats = {}
|
||||
for t in self.texts:
|
||||
if t.font_size not in self.font_size_stats:
|
||||
self.font_size_stats[t.font_size] = 0
|
||||
self.font_size_stats[t.font_size] += len(t.text_as_string)
|
||||
|
||||
self.font_size_stats = FontSizeStats(self.font_size_stats)
|
||||
|
||||
self.identify_columns()
|
||||
|
||||
def sort_into_horizontal_boxes(self, document_font_size_stats):
    """Group ``self.texts`` into horizontal boxes stored on the page.

    A text joins the existing box whose ``bottom`` is nearest to its own,
    provided the difference is below the tolerance; otherwise it starts a
    new box. Each box is then sorted internally, and
    ``self.horizontal_boxes`` is ordered by ascending ``bottom``.

    *document_font_size_stats* is accepted but not used here.
    """
    self.horizontal_boxes = []

    def find_closest_match(text):
        'Return horizontal box whose bottom is closest to text or None'
        # 3.1 is the largest bottom-edge difference that still counts as a
        # match; it is in the same units as the texts' ``bottom`` values.
        closest, ans = 3.1, None
        for hb in self.horizontal_boxes:
            diff = abs(text.bottom - hb.bottom)
            if diff < closest:
                # Tighten the threshold so that only a strictly closer box
                # can replace this one. The original swapped the assignment
                # (``diff, ans = min, hb``), so the last box within the
                # tolerance won instead of the closest one.
                closest, ans = diff, hb
        return ans

    for t in self.texts:
        hb = find_closest_match(t)
        if hb is None:
            self.horizontal_boxes.append(HorizontalBox(t))
        else:
            hb.append(t)

    for hb in self.horizontal_boxes:
        hb.sort()

    # key= gives the same ordering as the old cmp= form and also works on
    # Python 3.
    self.horizontal_boxes.sort(key=lambda hb: hb.bottom)
|
||||
|
||||
def identify_columns(self):
    """Walk the page's horizontal boxes (column detection is not yet implemented).

    The loop body is still a placeholder, so calling this has no effect.
    """
    # This method is invoked from the page constructor, at which point
    # ``horizontal_boxes`` has not been created yet; fall back to an empty
    # list instead of raising AttributeError.
    boxes = getattr(self, 'horizontal_boxes', [])

    def neighborhood(i):
        """Return the boxes adjacent to box *i* as a list (at most two)."""
        if i == 0:
            return boxes[1:3]
        # Slicing keeps this safe for the last box, where i+1 is out of range.
        return boxes[i-1:i] + boxes[i+1:i+2]

    for i, hbox in enumerate(boxes):
        pass
|
||||
|
||||
|
||||
|
||||
class PDFDocument(object):
|
||||
@ -69,6 +176,20 @@ class PDFDocument(object):
|
||||
self.page_map[page.id] = page
|
||||
self.pages.append(page)
|
||||
|
||||
self.collect_font_statistics()
|
||||
|
||||
for page in self.pages:
|
||||
page.sort_into_horizontal_boxes(self.font_size_stats)
|
||||
|
||||
def collect_font_statistics(self):
    """Combine the per-page font size statistics into ``self.font_size_stats``.

    The values of every page's ``font_size_stats`` mapping are summed per
    font size and the totals are wrapped in a ``FontSizeStats``.
    """
    self.font_size_stats = {}
    for p in self.pages:
        # Iterating the mapping directly yields only its keys, so the
        # (size, value) unpacking needs .items().
        # NOTE(review): the per-page values look like per-page fractions
        # rather than raw character counts, which would weight every page
        # equally - confirm this is the intended aggregation.
        for sz, chars in p.font_size_stats.items():
            if sz not in self.font_size_stats:
                self.font_size_stats[sz] = 0
            self.font_size_stats[sz] += chars

    self.font_size_stats = FontSizeStats(self.font_size_stats)
|
||||
|
||||
|
||||
|
||||
|
@ -228,7 +228,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
|
||||
self.connect(self.action_bookmark, SIGNAL('triggered(bool)'), self.bookmark)
|
||||
self.connect(self.action_forward, SIGNAL('triggered(bool)'), self.forward)
|
||||
self.connect(self.action_preferences, SIGNAL('triggered(bool)'), lambda x: self.view.config(self))
|
||||
self.connect(self.pos, SIGNAL('valueChanged(double)'), self.goto_page)
|
||||
self.pos.editingFinished.connect(self.goto_page_num)
|
||||
self.connect(self.vertical_scrollbar, SIGNAL('valueChanged(int)'),
|
||||
lambda x: self.goto_page(x/100.))
|
||||
self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'), self.find)
|
||||
@ -319,6 +319,9 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
|
||||
if pos is not None:
|
||||
self.goto_page(pos)
|
||||
|
||||
def goto_page_num(self):
    """Go to the page number currently held by the ``self.pos`` widget."""
    self.goto_page(self.pos.value())
|
||||
|
||||
def forward(self, x):
|
||||
pos = self.history.forward()
|
||||
|
Loading…
x
Reference in New Issue
Block a user