Merge from trunk

Charles Haley 2011-02-13 10:08:22 +00:00
commit a1fc82d91c
3 changed files with 208 additions and 3 deletions

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AppleDaily(BasicNewsRecipe):
    title = u'蘋果日報'
    __author__ = u'蘋果日報'
    __publisher__ = u'蘋果日報'
    description = u'蘋果日報'
    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
    language = 'zh_TW'
    encoding = 'UTF-8'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_javascript = True
    remove_tags_before = dict(name=['ul', 'h1'])
    remove_tags_after = dict(name='form')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                   dict(name=['script', 'noscript', 'style', 'form'])]
    no_stylesheets = True
    extra_css = '''
        @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
        body {margin-right: 8pt; font-family: 'uming', serif;}
        h1 {font-family: 'uming', serif, sans-serif}
        '''
    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    preprocess_regexps = [
        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
         lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
    ]
    def get_cover_url(self):
        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'

    #def get_browser(self):
        #br = BasicNewsRecipe.get_browser()
        #if self.username is not None and self.password is not None:
        #    br.open('http://www.nytimes.com/auth/login')
        #    br.select_form(name='login')
        #    br['USERID'] = self.username
        #    br['PASSWORD'] = self.password
        #    br.submit()
        #return br
    def preprocess_html(self, soup):
        #process all the images
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            #print 'checking image: ' + iurl
            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
            m = p.search(iurl)
            if m is not None:
                iurl = 'http://' + m.group('server') + '/' + m.group('path')
                #print 'working! new url: ' + iurl
                tag['src'] = iurl
            #else:
                #print 'not good'

        #process all the links that go through the image proxy
        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
            iurl = tag['href']
            #print 'checking image: ' + iurl
            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
            m = p.search(iurl)
            if m is not None:
                iurl = 'http://' + m.group('server') + '/' + m.group('path')
                #print 'working! new url: ' + iurl
                tag['href'] = iurl
            #else:
                #print 'not good'

        return soup
    def parse_index(self):
        base = 'http://news.hotpot.hk/fruit'
        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')

        #def feed_title(div):
        #    return ''.join(div.findAll(text=True, recursive=False)).strip()

        articles = {}
        key = None
        ans = []
        for div in soup.findAll('li'):
            key = div.find(text=True, recursive=True)
            #if key == u'豪情':
            #    continue
            print 'section=' + key

            articles[key] = []
            ans.append(key)

            a = div.find('a', href=True)
            if not a:
                continue
            url = base + '/' + a['href']
            print 'url=' + url

            if not articles.has_key(key):
                articles[key] = []
            else:
                # sub page
                subSoup = self.index_to_soup(url)
                for subDiv in subSoup.findAll('li'):
                    subA = subDiv.find('a', href=True)
                    subTitle = subDiv.find(text=True, recursive=True)
                    subUrl = base + '/' + subA['href']
                    print 'subUrl' + subUrl

                    articles[key].append(
                        dict(title=subTitle,
                             url=subUrl,
                             date='',
                             description='',
                             content=''))

            #elif div['class'] in ['story', 'story headline']:
            #    a = div.find('a', href=True)
            #    if not a:
            #        continue
            #    url = re.sub(r'\?.*', '', a['href'])
            #    url += '?pagewanted=all'
            #    title = self.tag_to_string(a, use_alt=True).strip()
            #    description = ''
            #    pubdate = strftime('%a, %d %b')
            #    summary = div.find(True, attrs={'class':'summary'})
            #    if summary:
            #        description = self.tag_to_string(summary, use_alt=False)
            #
            #    feed = key if key is not None else 'Uncategorized'
            #    if not articles.has_key(feed):
            #        articles[feed] = []
            #    if not 'podcasts' in url:
            #        articles[feed].append(
            #            dict(title=title, url=url, date=pubdate,
            #                 description=description,
            #                 content=''))

        #ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
        return ans
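
For reference, both the preprocess_regexps entry and preprocess_html() in this recipe rewrite proxied image links of the form img.php?server=...&path=... into direct URLs. A minimal standalone sketch of that rewrite, using a made-up URL purely for illustration:

import re

# Same pattern used in preprocess_html() above.
p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)',
               re.DOTALL|re.IGNORECASE)

def direct_url(iurl):
    # Turn a proxied img.php link into a direct http URL; leave others untouched.
    m = p.search(iurl)
    if m is not None:
        return 'http://' + m.group('server') + '/' + m.group('path')
    return iurl

# Hypothetical proxied link (not a real Apple Daily URL):
print(direct_url('img.php?server=img.example.com&path=photos/today/01.jpg'))
# -> http://img.example.com/photos/today/01.jpg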

View File

@@ -15,7 +15,29 @@ from calibre.ebooks.pdb.header import PdbHeaderReader

class APNXBuilder(object):
    '''
-   Currently uses the Adobe 1024 byte count equal one page formula.
+   2300 characters of uncompressed text per page. This is
+   not meant to map 1 to 1 to a print book but to be a
+   close enough measure.
+
+   A test book was chosen and the characters were counted
+   on one page. This number was rounded to 2240, then 60
+   characters of markup were added to the total, giving
+   2300.
+
+   Uncompressed text length is used because it's easily
+   accessible in MOBI files (part of the header). Also,
+   it's faster to work off of the length than to
+   decompress and parse the actual text.
+
+   A better but much more resource intensive and slower
+   method to calculate the page length would be to parse
+   the uncompressed text. For each paragraph we would
+   want to find how many lines it would occupy in a
+   paperback book: 70 characters per line and 32 lines per
+   page. So divide the number of characters (minus markup)
+   in each paragraph by 70. If there are fewer than 70
+   characters in the paragraph then it is 1 line. Then,
+   count every 32 lines and mark that location as a page.
    '''

    def write_apnx(self, mobi_file_path, apnx_path):

@@ -63,6 +85,6 @@ class APNXBuilder(object):

        while count < text_length:
            pages.append(count)
-           count += 1024
+           count += 2300

        return pages
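
The docstring above contrasts the fixed 2300-characters-per-page heuristic with a slower alternative that counts 70-character lines and marks a page every 32 lines. A rough sketch of both calculations, assuming the uncompressed text is already available as a plain string (the function names below are illustrative, not part of calibre's API):

CHARS_PER_PAGE = 2300   # fixed heuristic used by the APNX generator
CHARS_PER_LINE = 70     # assumed paperback line width
LINES_PER_PAGE = 32     # assumed paperback page height

def pages_by_char_count(text_length):
    # Mark a page boundary every 2300 characters of uncompressed text,
    # mirroring the while loop in the diff above.
    return list(range(0, text_length, CHARS_PER_PAGE))

def pages_by_line_count(text):
    # Heavier alternative: estimate how many 70-character lines each
    # paragraph occupies and mark a page boundary every 32nd line.
    # Page offsets are approximate (paragraph-level granularity).
    pages = [0]
    lines = 0
    offset = 0
    for para in text.split('\n\n'):
        para_lines = max(1, (len(para) + CHARS_PER_LINE - 1) // CHARS_PER_LINE)
        for i in range(para_lines):
            lines += 1
            if lines % LINES_PER_PAGE == 0:
                pages.append(offset + min(len(para), (i + 1) * CHARS_PER_LINE))
        offset += len(para) + 2  # account for the blank-line separator
    return pages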

View File

@@ -232,16 +232,37 @@ class Device(DeviceConfig, DevicePlugin):

        time.sleep(5)
        drives = {}
+       seen = set()
+       prod_pat = re.compile(r'PROD_(.+?)&')
+       dup_prod_id = False
+
+       def check_for_dups(pnp_id):
+           try:
+               match = prod_pat.search(pnp_id)
+               if match is not None:
+                   prodid = match.group(1)
+                   if prodid in seen:
+                       return True
+                   else:
+                       seen.add(prodid)
+           except:
+               pass
+           return False
+
        for drive, pnp_id in win_pnp_drives().items():
            if self.windows_match_device(pnp_id, 'WINDOWS_CARD_A_MEM') and \
                    not drives.get('carda', False):
                drives['carda'] = drive
+               dup_prod_id |= check_for_dups(pnp_id)
            elif self.windows_match_device(pnp_id, 'WINDOWS_CARD_B_MEM') and \
                    not drives.get('cardb', False):
                drives['cardb'] = drive
+               dup_prod_id |= check_for_dups(pnp_id)
            elif self.windows_match_device(pnp_id, 'WINDOWS_MAIN_MEM') and \
                    not drives.get('main', False):
                drives['main'] = drive
+               dup_prod_id |= check_for_dups(pnp_id)

            if 'main' in drives.keys() and 'carda' in drives.keys() and \
                    'cardb' in drives.keys():

@@ -263,7 +284,8 @@ class Device(DeviceConfig, DevicePlugin):

        # Sort drives by their PNP drive numbers if the CARD and MAIN
        # MEM strings are identical
-       if self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
+       if dup_prod_id or \
+               self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
                self.WINDOWS_CARD_B_MEM) or \
                self.WINDOWS_CARD_A_MEM == self.WINDOWS_CARD_B_MEM:
            letters = sorted(drives.values(), cmp=drivecmp)
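
For reference, the duplicate check added above keys on the PROD_ segment of the Windows PnP ID, and the dup_prod_id flag then forces the drive letters to be sorted by PNP drive number. A standalone sketch of that check, using made-up PnP ID strings purely for illustration:

import re

prod_pat = re.compile(r'PROD_(.+?)&')

def find_dup_prod_ids(pnp_ids):
    # Return True as soon as two PnP IDs share the same PROD_ value,
    # mirroring the check_for_dups() closure in the diff above.
    seen = set()
    for pnp_id in pnp_ids:
        match = prod_pat.search(pnp_id)
        if match is None:
            continue
        prodid = match.group(1)
        if prodid in seen:
            return True
        seen.add(prodid)
    return False

# Hypothetical device: main memory and card report the same PROD_ value,
# so drive order can no longer be inferred from the PROD_ string alone.
ids = ['USBSTOR\\DISK&VEN_EREADER&PROD_INTERNAL&REV_0100\\1&0',
       'USBSTOR\\DISK&VEN_EREADER&PROD_INTERNAL&REV_0100\\2&0']
print(find_dup_prod_ids(ids))  # True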