From 44ae9f85b05eb9820eac83429738ae6dd4050c34 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 12 Feb 2011 09:37:59 -0500
Subject: [PATCH 1/4] Kindle Interface: Tweak APNX generation to estimate a page.

---
 src/calibre/devices/kindle/apnx.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py
index 328d3a97a6..0c79a3829f 100644
--- a/src/calibre/devices/kindle/apnx.py
+++ b/src/calibre/devices/kindle/apnx.py
@@ -15,7 +15,29 @@ from calibre.ebooks.pdb.header import PdbHeaderReader
 
 class APNXBuilder(object):
     '''
-    Currently uses the Adobe 1024 byte count equal one page formula.
+    2300 characters of uncompressed text per page. This is
+    not meant to map 1 to 1 to a print book but to be a
+    close enough measure.
+
+    A test book was chosen and the characters were counted
+    on one page. This number was rounded to 2240, then 60
+    characters of markup were added to the total, giving
+    2300.
+
+    Uncompressed text length is used because it's easily
+    accessible in MOBI files (part of the header). It is
+    also faster to work from the length than to
+    decompress and parse the actual text.
+
+    A better but much more resource intensive and slower
+    method to calculate the page length would be to parse
+    the uncompressed text. For each paragraph we would
+    want to find how many lines it would occupy in a paper
+    back book. 70 charaters per line and 32 lines per page.
+    So divide the number of characters (minus markup) in
+    each paragraph by 70. If there are less than 70
+    characters in the paragraph then it is 1 line. Then,
+    count every 32 lines and mark that location as a page.
     '''
 
     def write_apnx(self, mobi_file_path, apnx_path):
@@ -63,6 +85,6 @@ class APNXBuilder(object):
 
         while count < text_length:
             pages.append(count)
-            count += 1024
+            count += 2300
 
         return pages

From 527bce3e5ed3817a5c7184c9dd63f03884ea1f2c Mon Sep 17 00:00:00 2001
From: John Schember
Date: Sat, 12 Feb 2011 09:40:21 -0500
Subject: [PATCH 2/4] ...

---
 src/calibre/devices/kindle/apnx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/kindle/apnx.py b/src/calibre/devices/kindle/apnx.py
index 0c79a3829f..d8dc9709d9 100644
--- a/src/calibre/devices/kindle/apnx.py
+++ b/src/calibre/devices/kindle/apnx.py
@@ -33,7 +33,7 @@ class APNXBuilder(object):
     method to calculate the page length would be to parse
     the uncompressed text. For each paragraph we would
     want to find how many lines it would occupy in a paper
-    back book. 70 charaters per line and 32 lines per page.
+    back book. 70 characters per line and 32 lines per page.
     So divide the number of characters (minus markup) in
     each paragraph by 70. If there are less than 70
     characters in the paragraph then it is 1 line. Then,
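The APNX docstring above describes, but does not implement, the slower paragraph-based estimate. A rough standalone sketch of that idea (about 70 characters per printed line, 32 lines per page; the function name and details below are made up for illustration and are not part of these patches):

    def estimate_page_offsets(paragraphs, chars_per_line=70, lines_per_page=32):
        '''Return character offsets at which new pages would start.'''
        pages = [0]
        lines = 0
        offset = 0
        for para in paragraphs:
            # A paragraph shorter than one full line still occupies one line.
            para_lines = max(1, len(para) // chars_per_line)
            for i in range(para_lines):
                lines += 1
                if lines % lines_per_page == 0:
                    # Mark an approximate boundary inside the paragraph.
                    pages.append(min(offset + (i + 1) * chars_per_line,
                                     offset + len(para)))
            offset += len(para)
        return pages

The patch itself keeps the cheap approximation instead: one page boundary for every 2300 characters of uncompressed text, which is the count += 2300 change in patch 1.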
From 5ca7a4a8845a69e67456df86c16921c1bd089453 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 12 Feb 2011 11:14:45 -0700
Subject: [PATCH 3/4] Fix #8916 (Cybook Orizon connection)

---
 src/calibre/devices/usbms/device.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py
index a31897c8e5..b0857de909 100644
--- a/src/calibre/devices/usbms/device.py
+++ b/src/calibre/devices/usbms/device.py
@@ -232,16 +232,37 @@ class Device(DeviceConfig, DevicePlugin):
 
         time.sleep(5)
         drives = {}
+        seen = set()
+        prod_pat = re.compile(r'PROD_(.+?)&')
+        dup_prod_id = False
+
+        def check_for_dups(pnp_id):
+            try:
+                match = prod_pat.search(pnp_id)
+                if match is not None:
+                    prodid = match.group(1)
+                    if prodid in seen:
+                        return True
+                    else:
+                        seen.add(prodid)
+            except:
+                pass
+            return False
+
+
         for drive, pnp_id in win_pnp_drives().items():
             if self.windows_match_device(pnp_id, 'WINDOWS_CARD_A_MEM') and \
                     not drives.get('carda', False):
                 drives['carda'] = drive
+                dup_prod_id |= check_for_dups(pnp_id)
             elif self.windows_match_device(pnp_id, 'WINDOWS_CARD_B_MEM') and \
                     not drives.get('cardb', False):
                 drives['cardb'] = drive
+                dup_prod_id |= check_for_dups(pnp_id)
             elif self.windows_match_device(pnp_id, 'WINDOWS_MAIN_MEM') and \
                     not drives.get('main', False):
                 drives['main'] = drive
+                dup_prod_id |= check_for_dups(pnp_id)
 
         if 'main' in drives.keys() and 'carda' in drives.keys() and \
                 'cardb' in drives.keys():
@@ -263,7 +284,8 @@ class Device(DeviceConfig, DevicePlugin):
 
         # Sort drives by their PNP drive numbers if the CARD and MAIN
         # MEM strings are identical
-        if self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
+        if dup_prod_id or \
+                self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
                 self.WINDOWS_CARD_B_MEM) or \
                 self.WINDOWS_CARD_A_MEM == self.WINDOWS_CARD_B_MEM:
             letters = sorted(drives.values(), cmp=drivecmp)
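Patch 3 suggests that a device such as the Cybook Orizon (ticket #8916) can report the same PROD_ string for its main memory and its card reader, in which case the drive letters are re-sorted by PNP drive number. A standalone illustration of the duplicate check using the same PROD_(.+?)& pattern; the sample pnp id strings below are invented, not copied from a real device:

    import re

    prod_pat = re.compile(r'PROD_(.+?)&')
    seen = set()

    def check_for_dups(pnp_id):
        # True when this product id has already been seen for another drive.
        match = prod_pat.search(pnp_id)
        if match is not None:
            prodid = match.group(1)
            if prodid in seen:
                return True
            seen.add(prodid)
        return False

    main_mem = r'USBSTOR\DISK&VEN_BOOKEEN&PROD_ORIZON&REV_0100\12345&0'
    card = r'USBSTOR\DISK&VEN_BOOKEEN&PROD_ORIZON&REV_0100\12345&1'
    print(check_for_dups(main_mem))  # False, first time this product id is seen
    print(check_for_dups(card))      # True, so dup_prod_id is set and drives get re-sorted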
From cc2f6b8d5eec722999059715a99b5a060c9958b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 12 Feb 2011 22:08:38 -0700
Subject: [PATCH 4/4] Fix #405 (New news feed). Apple daily by MrLai

---
 resources/recipes/apple_daily.recipe | 161 +++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 resources/recipes/apple_daily.recipe

diff --git a/resources/recipes/apple_daily.recipe b/resources/recipes/apple_daily.recipe
new file mode 100644
index 0000000000..1e9953af43
--- /dev/null
+++ b/resources/recipes/apple_daily.recipe
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class AppleDaily(BasicNewsRecipe):
+
+    title = u'蘋果日報'
+    __author__ = u'蘋果日報'
+    __publisher__ = u'蘋果日報'
+    description = u'蘋果日報'
+    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
+    language = 'zh_TW'
+    encoding = 'UTF-8'
+    timefmt = ' [%a, %d %b, %Y]'
+    needs_subscription = False
+    remove_javascript = True
+    remove_tags_before = dict(name=['ul', 'h1'])
+    remove_tags_after = dict(name='form')
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+                   dict(name=['script', 'noscript', 'style', 'form'])]
+    no_stylesheets = True
+    extra_css = '''
+        @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
+        body {margin-right: 8pt; font-family: 'uming', serif;}
+        h1 {font-family: 'uming', serif, sans-serif}
+        '''
+    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+
+    preprocess_regexps = [
+        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
+         lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
+    ]
+
+    def get_cover_url(self):
+        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
+
+
+    #def get_browser(self):
+        #br = BasicNewsRecipe.get_browser()
+        #if self.username is not None and self.password is not None:
+        #    br.open('http://www.nytimes.com/auth/login')
+        #    br.select_form(name='login')
+        #    br['USERID'] = self.username
+        #    br['PASSWORD'] = self.password
+        #    br.submit()
+        #return br
+
+    def preprocess_html(self, soup):
+        #process all the images
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            #print 'checking image: ' + iurl
+
+            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
+            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
+
+            m = p.search(iurl)
+
+            if m is not None:
+                iurl = 'http://' + m.group('server') + '/' + m.group('path')
+                #print 'working! new url: ' + iurl
+                tag['src'] = iurl
+            #else:
+                #print 'not good'
+
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
+            iurl = tag['href']
+            #print 'checking image: ' + iurl
+
+            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
+            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
+
+            m = p.search(iurl)
+
+            if m is not None:
+                iurl = 'http://' + m.group('server') + '/' + m.group('path')
+                #print 'working! new url: ' + iurl
+                tag['href'] = iurl
+            #else:
+                #print 'not good'
+
+        return soup
+
+
+    def parse_index(self):
+        base = 'http://news.hotpot.hk/fruit'
+        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
+
+        #def feed_title(div):
+        #    return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+        articles = {}
+        key = None
+        ans = []
+        for div in soup.findAll('li'):
+            key = div.find(text=True, recursive=True);
+            #if key == u'豪情':
+            #    continue;
+
+            print 'section=' + key
+
+            articles[key] = []
+
+            ans.append(key)
+
+            a = div.find('a', href=True)
+
+            if not a:
+                continue
+
+            url = base + '/' + a['href']
+            print 'url=' + url
+
+            if not articles.has_key(key):
+                articles[key] = []
+            else:
+                # sub page
+                subSoup = self.index_to_soup(url)
+
+                for subDiv in subSoup.findAll('li'):
+                    subA = subDiv.find('a', href=True)
+                    subTitle = subDiv.find(text=True, recursive=True)
+                    subUrl = base + '/' + subA['href']
+
+                    print 'subUrl=' + subUrl
+
+                    articles[key].append(
+                        dict(title=subTitle,
+                             url=subUrl,
+                             date='',
+                             description='',
+                             content=''))
+
+
+#        elif div['class'] in ['story', 'story headline']:
+#            a = div.find('a', href=True)
+#            if not a:
+#                continue
+#            url = re.sub(r'\?.*', '', a['href'])
+#            url += '?pagewanted=all'
+#            title = self.tag_to_string(a, use_alt=True).strip()
+#            description = ''
+#            pubdate = strftime('%a, %d %b')
+#            summary = div.find(True, attrs={'class':'summary'})
+#            if summary:
+#                description = self.tag_to_string(summary, use_alt=False)
+#
+#            feed = key if key is not None else 'Uncategorized'
+#            if not articles.has_key(feed):
+#                articles[feed] = []
+#            if not 'podcasts' in url:
+#                articles[feed].append(
+#                    dict(title=title, url=url, date=pubdate,
+#                         description=description,
+#                         content=''))
+#        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
+        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
+        return ans
+
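The recipe's preprocess_regexps entry and preprocess_html both rewrite proxied links of the form img.php?server=...&path=... into direct URLs. A quick standalone check of that rewrite, reusing the recipe's own pattern (the sample URL below is invented for illustration; only the news.hotpot.hk/fruit base comes from the recipe):

    import re

    p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)',
                   re.DOTALL|re.IGNORECASE)

    iurl = 'http://news.hotpot.hk/fruit/img.php?server=img.example.org&path=2011/02/12/photo.jpg'
    m = p.search(iurl)
    if m is not None:
        iurl = 'http://' + m.group('server') + '/' + m.group('path')
    print(iurl)  # http://img.example.org/2011/02/12/photo.jpg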