Merge from trunk

Charles Haley 2011-02-13 10:08:22 +00:00
commit a1fc82d91c
3 changed files with 208 additions and 3 deletions

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AppleDaily(BasicNewsRecipe):
    title = u'蘋果日報'
    __author__ = u'蘋果日報'
    __publisher__ = u'蘋果日報'
    description = u'蘋果日報'
    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
    language = 'zh_TW'
    encoding = 'UTF-8'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_javascript = True
    remove_tags_before = dict(name=['ul', 'h1'])
    remove_tags_after = dict(name='form')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                   dict(name=['script', 'noscript', 'style', 'form'])]
    no_stylesheets = True
    extra_css = '''
        @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
        body {margin-right: 8pt; font-family: 'uming', serif;}
        h1 {font-family: 'uming', serif, sans-serif}
        '''
    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    preprocess_regexps = [
        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
         lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
    ]
    def get_cover_url(self):
        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'

    #def get_browser(self):
        #br = BasicNewsRecipe.get_browser()
        #if self.username is not None and self.password is not None:
        #    br.open('http://www.nytimes.com/auth/login')
        #    br.select_form(name='login')
        #    br['USERID'] = self.username
        #    br['PASSWORD'] = self.password
        #    br.submit()
        #return br
    def preprocess_html(self, soup):
        #process all the images
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            #print 'checking image: ' + iurl
            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
            m = p.search(iurl)
            if m is not None:
                iurl = 'http://' + m.group('server') + '/' + m.group('path')
                #print 'working! new url: ' + iurl
                tag['src'] = iurl
            #else:
                #print 'not good'

        #process all the links that go through the image proxy
        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
            iurl = tag['href']
            #print 'checking image: ' + iurl
            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
            m = p.search(iurl)
            if m is not None:
                iurl = 'http://' + m.group('server') + '/' + m.group('path')
                #print 'working! new url: ' + iurl
                tag['href'] = iurl
            #else:
                #print 'not good'

        return soup
    def parse_index(self):
        base = 'http://news.hotpot.hk/fruit'
        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')

        #def feed_title(div):
        #    return ''.join(div.findAll(text=True, recursive=False)).strip()

        articles = {}
        key = None
        ans = []
        for div in soup.findAll('li'):
            key = div.find(text=True, recursive=True)
            #if key == u'豪情':
            #    continue
            print 'section=' + key

            articles[key] = []
            ans.append(key)

            a = div.find('a', href=True)
            if not a:
                continue
            url = base + '/' + a['href']
            print 'url=' + url

            if not articles.has_key(key):
                articles[key] = []
            else:
                # sub page
                subSoup = self.index_to_soup(url)
                for subDiv in subSoup.findAll('li'):
                    subA = subDiv.find('a', href=True)
                    subTitle = subDiv.find(text=True, recursive=True)
                    subUrl = base + '/' + subA['href']
                    print 'subUrl' + subUrl

                    articles[key].append(
                        dict(title=subTitle,
                             url=subUrl,
                             date='',
                             description='',
                             content=''))

            #elif div['class'] in ['story', 'story headline']:
            #    a = div.find('a', href=True)
            #    if not a:
            #        continue
            #    url = re.sub(r'\?.*', '', a['href'])
            #    url += '?pagewanted=all'
            #    title = self.tag_to_string(a, use_alt=True).strip()
            #    description = ''
            #    pubdate = strftime('%a, %d %b')
            #    summary = div.find(True, attrs={'class':'summary'})
            #    if summary:
            #        description = self.tag_to_string(summary, use_alt=False)
            #
            #    feed = key if key is not None else 'Uncategorized'
            #    if not articles.has_key(feed):
            #        articles[feed] = []
            #    if not 'podcasts' in url:
            #        articles[feed].append(
            #            dict(title=title, url=url, date=pubdate,
            #                 description=description,
            #                 content=''))

        #ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
        return ans
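
For reference, both the preprocess_regexps entry and preprocess_html() in this recipe rewrite proxied image links of the form img.php?server=...&path=... into direct URLs. A minimal standalone sketch of that rewrite, using a made-up URL purely for illustration:

import re

# Same pattern used in preprocess_html() above.
p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)',
               re.DOTALL|re.IGNORECASE)

def direct_url(iurl):
    # Turn a proxied img.php link into a direct http URL; leave others untouched.
    m = p.search(iurl)
    if m is not None:
        return 'http://' + m.group('server') + '/' + m.group('path')
    return iurl

# Hypothetical proxied link (not a real Apple Daily URL):
print(direct_url('img.php?server=img.example.com&path=photos/today/01.jpg'))
# -> http://img.example.com/photos/today/01.jpg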

View File

@@ -15,7 +15,29 @@ from calibre.ebooks.pdb.header import PdbHeaderReader

class APNXBuilder(object):
    '''
-   Currently uses the Adobe 1024 byte count equal one page formula.
+   2300 characters of uncompressed text per page. This is
+   not meant to map 1 to 1 to a print book but to be a
+   close enough measure.
+
+   A test book was chosen and the characters were counted
+   on one page. This number was rounded to 2240, then 60
+   characters of markup were added to the total, giving
+   2300.
+
+   Uncompressed text length is used because it's easily
+   accessible in MOBI files (part of the header). Also,
+   it's faster to work off of the length than to
+   decompress and parse the actual text.
+
+   A better but much more resource intensive and slower
+   method to calculate the page length would be to parse
+   the uncompressed text. For each paragraph we would
+   want to find how many lines it would occupy in a
+   paperback book: 70 characters per line and 32 lines per
+   page. So divide the number of characters (minus markup)
+   in each paragraph by 70. If there are fewer than 70
+   characters in the paragraph then it is 1 line. Then,
+   count every 32 lines and mark that location as a page.
    '''

    def write_apnx(self, mobi_file_path, apnx_path):

@@ -63,6 +85,6 @@ class APNXBuilder(object):

        while count < text_length:
            pages.append(count)
-           count += 1024
+           count += 2300

        return pages
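
The docstring above contrasts the fixed 2300-characters-per-page heuristic with a slower alternative that counts 70-character lines and marks a page every 32 lines. A rough sketch of both calculations, assuming the uncompressed text is already available as a plain string (the function names below are illustrative, not part of calibre's API):

CHARS_PER_PAGE = 2300   # fixed heuristic used by the APNX generator
CHARS_PER_LINE = 70     # assumed paperback line width
LINES_PER_PAGE = 32     # assumed paperback page height

def pages_by_char_count(text_length):
    # Mark a page boundary every 2300 characters of uncompressed text,
    # mirroring the while loop in the diff above.
    return list(range(0, text_length, CHARS_PER_PAGE))

def pages_by_line_count(text):
    # Heavier alternative: estimate how many 70-character lines each
    # paragraph occupies and mark a page boundary every 32nd line.
    # Page offsets are approximate (paragraph-level granularity).
    pages = [0]
    lines = 0
    offset = 0
    for para in text.split('\n\n'):
        para_lines = max(1, (len(para) + CHARS_PER_LINE - 1) // CHARS_PER_LINE)
        for i in range(para_lines):
            lines += 1
            if lines % LINES_PER_PAGE == 0:
                pages.append(offset + min(len(para), (i + 1) * CHARS_PER_LINE))
        offset += len(para) + 2  # account for the blank-line separator
    return pages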

View File

@@ -232,16 +232,37 @@ class Device(DeviceConfig, DevicePlugin):

        time.sleep(5)
        drives = {}
+       seen = set()
+       prod_pat = re.compile(r'PROD_(.+?)&')
+       dup_prod_id = False
+
+       def check_for_dups(pnp_id):
+           try:
+               match = prod_pat.search(pnp_id)
+               if match is not None:
+                   prodid = match.group(1)
+                   if prodid in seen:
+                       return True
+                   else:
+                       seen.add(prodid)
+           except:
+               pass
+           return False
+
        for drive, pnp_id in win_pnp_drives().items():
            if self.windows_match_device(pnp_id, 'WINDOWS_CARD_A_MEM') and \
                    not drives.get('carda', False):
                drives['carda'] = drive
+               dup_prod_id |= check_for_dups(pnp_id)
            elif self.windows_match_device(pnp_id, 'WINDOWS_CARD_B_MEM') and \
                    not drives.get('cardb', False):
                drives['cardb'] = drive
+               dup_prod_id |= check_for_dups(pnp_id)
            elif self.windows_match_device(pnp_id, 'WINDOWS_MAIN_MEM') and \
                    not drives.get('main', False):
                drives['main'] = drive
+               dup_prod_id |= check_for_dups(pnp_id)

            if 'main' in drives.keys() and 'carda' in drives.keys() and \
                    'cardb' in drives.keys():

@@ -263,7 +284,8 @@ class Device(DeviceConfig, DevicePlugin):

        # Sort drives by their PNP drive numbers if the CARD and MAIN
        # MEM strings are identical
-       if self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
+       if dup_prod_id or \
+               self.WINDOWS_MAIN_MEM in (self.WINDOWS_CARD_A_MEM,
                self.WINDOWS_CARD_B_MEM) or \
                self.WINDOWS_CARD_A_MEM == self.WINDOWS_CARD_B_MEM:
            letters = sorted(drives.values(), cmp=drivecmp)
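
For reference, the duplicate check added above keys on the PROD_ segment of the Windows PnP ID, and the dup_prod_id flag then forces the drive letters to be sorted by PNP drive number. A standalone sketch of that check, using made-up PnP ID strings purely for illustration:

import re

prod_pat = re.compile(r'PROD_(.+?)&')

def find_dup_prod_ids(pnp_ids):
    # Return True as soon as two PnP IDs share the same PROD_ value,
    # mirroring the check_for_dups() closure in the diff above.
    seen = set()
    for pnp_id in pnp_ids:
        match = prod_pat.search(pnp_id)
        if match is None:
            continue
        prodid = match.group(1)
        if prodid in seen:
            return True
        seen.add(prodid)
    return False

# Hypothetical device: main memory and card report the same PROD_ value,
# so drive order can no longer be inferred from the PROD_ string alone.
ids = ['USBSTOR\\DISK&VEN_EREADER&PROD_INTERNAL&REV_0100\\1&0',
       'USBSTOR\\DISK&VEN_EREADER&PROD_INTERNAL&REV_0100\\2&0']
print(find_dup_prod_ids(ids))  # True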