From 9161d924aad670573e5c997df65d9b215bc19be7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 2 Apr 2013 14:00:16 +0530 Subject: [PATCH 1/8] Update Business Week --- recipes/bwmagazine.recipe | 65 ++++----------------------------------- 1 file changed, 6 insertions(+), 59 deletions(-) diff --git a/recipes/bwmagazine.recipe b/recipes/bwmagazine.recipe index d11861ce08..ae3197da81 100644 --- a/recipes/bwmagazine.recipe +++ b/recipes/bwmagazine.recipe @@ -37,68 +37,15 @@ class BusinessWeek(BasicNewsRecipe): , 'language' : language } - #remove_tags = [ - #dict(attrs={'class':'inStory'}) - #,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td']) - #,dict(attrs={'id':['inset','videoDisplay']}) - #] - #keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})] - remove_attributes = ['lang'] - match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*'] - feeds = [ - (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'), - (u'Top News' , u'http://www.businessweek.com/rss/bwdaily.rss' ), - (u'Asia', u'http://www.businessweek.com/rss/asia.rss'), - (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'), - (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'), - (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'), - (u'Europe', u'http://www.businessweek.com/rss/europe.rss'), - (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'), - (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'), - (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'), - (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'), - (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'), - (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'), - (u'Technology', u'http://www.businessweek.com/rss/technology.rss'), - (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'), - (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'), - (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'), - (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'), - (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'), - (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'), + (u'Top Stories', u'http://www.businessweek.com/feeds/most-popular.rss'), ] - def get_article_url(self, article): - url = article.get('guid', None) - if 'podcasts' in url: - return None - if 'surveys' in url: - return None - if 'images' in url: - return None - if 'feedroom' in url: - return None - if '/magazine/toc/' in url: - return None - rurl, sep, rest = url.rpartition('?') - if rurl: - return rurl - return rest - def print_version(self, url): - if '/news/' in url or '/blog/ in url': - return url - rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/') - return rurl.replace('/investing/','/investor/') + soup = self.index_to_soup(url) + prntver = soup.find('li', attrs={'class':'print tracked'}) + rurl = prntver.find('a', href=True)['href'] + return rurl + - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for alink in soup.findAll('a'): - if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) - return soup - From f3257d9865dd8bf23e0cd4008b0c166737424485 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 2 Apr 2013 18:55:19 +0530 Subject: [PATCH 2/8] Fix #1163272 (Text Bug in Content) --- src/calibre/translations/de.po | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/translations/de.po b/src/calibre/translations/de.po index b0e512c6bf..0655ceb4ee 100644 --- a/src/calibre/translations/de.po +++ b/src/calibre/translations/de.po @@ -22507,7 +22507,7 @@ msgstr "Autoren beginnend mit '%s'" #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3477 #, python-format msgid "Authors beginning with '%s'" -msgstr "Autoren beginnen mit mit %s" +msgstr "Autoren beginnen mit %s" #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi_builder.py:3518 msgid "NCX for Recently Added" From 7076d8c2f5576f096ec89189d93b021bae4ce080 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Apr 2013 08:32:01 +0530 Subject: [PATCH 3/8] Fix #1163659 (Wrong filename output in error message when "Guide reference not found") --- src/calibre/ebooks/oeb/reader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 68db089073..d0474fa7e8 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -333,8 +333,8 @@ class OEBReader(object): guide = self.oeb.guide manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): - href = elem.get('href') - path = urlnormalize(urldefrag(href)[0]) + ref_href = elem.get('href') + path = urlnormalize(urldefrag(ref_href)[0]) if path not in manifest.hrefs: corrected_href = None for href in manifest.hrefs: @@ -342,12 +342,12 @@ class OEBReader(object): corrected_href = href break if corrected_href is None: - self.logger.warn(u'Guide reference %r not found' % href) + self.logger.warn(u'Guide reference %r not found' % ref_href) continue - href = corrected_href + ref_href = corrected_href typ = elem.get('type') if typ not in guide: - guide.add(typ, elem.get('title'), href) + guide.add(typ, elem.get('title'), ref_href) def _find_ncx(self, opf): result = xpath(opf, '/o2:package/o2:spine/@toc') From fe1f2c79259aad4286a7c9194e8ab341c66b495c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Apr 2013 09:58:58 +0530 Subject: [PATCH 4/8] ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files. Fixes #1163520 (Request for new method to generate entries in ToC editor) --- src/calibre/ebooks/oeb/polish/toc.py | 29 ++++++++++++++++++++++++++++ src/calibre/gui2/toc/main.py | 21 +++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/oeb/polish/toc.py b/src/calibre/ebooks/oeb/polish/toc.py index 3a72b837c8..c84dd1b094 100644 --- a/src/calibre/ebooks/oeb/polish/toc.py +++ b/src/calibre/ebooks/oeb/polish/toc.py @@ -262,6 +262,35 @@ def from_links(container): toc.remove(child) return toc +def find_text(node): + LIMIT = 200 + pat = re.compile(r'\s+') + for child in node: + if isinstance(child, etree._Element): + text = xml2text(child).strip() + text = pat.sub(' ', text) + if len(text) < 1: + continue + if len(text) > LIMIT: + # Look for less text in a child of this node, recursively + ntext = find_text(child) + return ntext or (text[:LIMIT] + '...') + else: + return text + +def from_files(container): + toc = TOC() + for spinepath in container.spine_items: + name = container.abspath_to_name(spinepath) + root = container.parsed(name) + body = XPath('//h:body')(root) + if not body: + continue + text = find_text(body[0]) + if text: + toc.add(text, name) + return toc + def add_id(container, name, loc): root = container.parsed(name) body = root.xpath('//*[local-name()="body"]')[0] diff --git a/src/calibre/gui2/toc/main.py b/src/calibre/gui2/toc/main.py index 74886bbf63..7cb4f9b462 100644 --- a/src/calibre/gui2/toc/main.py +++ b/src/calibre/gui2/toc/main.py @@ -18,7 +18,7 @@ from PyQt4.Qt import (QPushButton, QFrame, QVariant, QMenu, QInputDialog, from calibre.ebooks.oeb.polish.container import get_container, AZW3Container from calibre.ebooks.oeb.polish.toc import ( - get_toc, add_id, TOC, commit_toc, from_xpaths, from_links) + get_toc, add_id, TOC, commit_toc, from_xpaths, from_links, from_files) from calibre.gui2 import Application, error_dialog, gprefs from calibre.gui2.progress_indicator import ProgressIndicator from calibre.gui2.toc.location import ItemEdit @@ -126,6 +126,7 @@ class ItemView(QFrame): # {{{ go_to_root = pyqtSignal() create_from_xpath = pyqtSignal(object) create_from_links = pyqtSignal() + create_from_files = pyqtSignal() flatten_toc = pyqtSignal() def __init__(self, parent): @@ -183,6 +184,15 @@ class ItemView(QFrame): # {{{ ))) l.addWidget(b) + self.cfb = b = QPushButton(_('Generate ToC from &files')) + b.clicked.connect(self.create_from_files) + b.setToolTip(textwrap.fill(_( + 'Generate a Table of Contents from individual files in the book.' + ' Each entry in the ToC will point to the start of the file, the' + ' text of the entry will be the "first line" of text from the file.' + ))) + l.addWidget(b) + self.xpb = b = QPushButton(_('Generate ToC from &XPath')) b.clicked.connect(self.create_from_user_xpath) b.setToolTip(textwrap.fill(_( @@ -577,6 +587,7 @@ class TOCView(QWidget): # {{{ i.add_new_item.connect(self.add_new_item) i.create_from_xpath.connect(self.create_from_xpath) i.create_from_links.connect(self.create_from_links) + i.create_from_files.connect(self.create_from_files) i.flatten_item.connect(self.flatten_item) i.flatten_toc.connect(self.flatten_toc) i.go_to_root.connect(self.go_to_root) @@ -778,6 +789,14 @@ class TOCView(QWidget): # {{{ _('No links were found that could be added to the Table of Contents.'), show=True) self.insert_toc_fragment(toc) + def create_from_files(self): + toc = from_files(self.ebook) + if len(toc) == 0: + return error_dialog(self, _('No items found'), + _('No files were found that could be added to the Table of Contents.'), show=True) + self.insert_toc_fragment(toc) + + # }}} class TOCEditor(QDialog): # {{{ From 3823f8da9f2b5c6c8d7f222a6ccf505ac43c0682 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Apr 2013 09:49:14 +0530 Subject: [PATCH 5/8] Update A List Apart --- recipes/list_apart.recipe | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/recipes/list_apart.recipe b/recipes/list_apart.recipe index 35cbaad958..c11956110f 100644 --- a/recipes/list_apart.recipe +++ b/recipes/list_apart.recipe @@ -1,33 +1,23 @@ -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +# vim:fileencoding=UTF-8 +from __future__ import unicode_literals from calibre.web.feeds.news import BasicNewsRecipe class AListApart (BasicNewsRecipe): - __author__ = u'Marc Busqué ' + __author__ = 'Marc Busqué ' __url__ = 'http://www.lamarciana.com' - __version__ = '1.0' + __version__ = '2.0' __license__ = 'GPL v3' - __copyright__ = u'2012, Marc Busqué ' + __copyright__ = '2012, Marc Busqué ' title = u'A List Apart' - description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.' + description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieve articles and columns.' language = 'en' tags = 'web development, software' oldest_article = 120 remove_empty_feeds = True - no_stylesheets = True encoding = 'utf8' cover_url = u'http://alistapart.com/pix/alalogo.gif' - keep_only_tags = [ - dict(name='div', attrs={'id': 'content'}) - ] - remove_tags = [ - dict(name='ul', attrs={'id': 'metastuff'}), - dict(name='div', attrs={'class': 'discuss'}), - dict(name='div', attrs={'class': 'discuss'}), - dict(name='div', attrs={'id': 'learnmore'}), - ] - remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] - extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}' + extra_css = u'img {max-width: 100%; display: block; margin: auto;}' feeds = [ - (u'A List Apart', u'http://www.alistapart.com/site/rss'), + (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'), ] From d65fd352e9ecb56a7e31dda3ff239ba77a91d5ae Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Apr 2013 10:38:11 +0530 Subject: [PATCH 6/8] Update Sing Tao Daily - Hong Kong and Apple Daily - Hong Kong. AM730 by Eddie Lau --- recipes/am730.recipe | 290 ++++++++++++++++++++++++++++ recipes/apple_daily.recipe | 386 ++++++++++++++++++++++++------------- recipes/singtaohk.recipe | 29 ++- 3 files changed, 553 insertions(+), 152 deletions(-) create mode 100644 recipes/am730.recipe diff --git a/recipes/am730.recipe b/recipes/am730.recipe new file mode 100644 index 0000000000..0fac4bea51 --- /dev/null +++ b/recipes/am730.recipe @@ -0,0 +1,290 @@ +# vim:fileencoding=UTF-8 +from __future__ import unicode_literals +__license__ = 'GPL v3' +__copyright__ = '2013, Eddie Lau' +__Date__ = '' +__HiResImg__ = True + +''' +Change Log: +2013/03/30 -- first version +''' + +from calibre import (__appname__, force_unicode, strftime) +from calibre.utils.date import now as nowf +import os, datetime, re +from calibre.web.feeds.recipes import BasicNewsRecipe +from contextlib import nested +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata import MetaInformation +from calibre.utils.localization import canonicalize_lang + +class AppleDaily(BasicNewsRecipe): + title = u'AM730' + __author__ = 'Eddie Lau' + publisher = 'AM730' + oldest_article = 1 + max_articles_per_feed = 100 + auto_cleanup = False + language = 'zh' + encoding = 'utf-8' + auto_cleanup = False + remove_javascript = True + use_embedded_content = False + no_stylesheets = True + description = 'http://www.am730.com.hk' + category = 'Chinese, News, Hong Kong' + masthead_url = 'http://www.am730.com.hk/images/logo.jpg' + + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}' + keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}), + dict(name='div', attrs={'class':'thecontent wordsnap'}), + dict(name='a', attrs={'class':'lightboximg'})] + remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}), + dict(name='img', attrs={'src':'/images/am_endmark.gif'})] + + def get_dtlocal(self): + dt_utc = datetime.datetime.utcnow() + # convert UTC to local hk time - at HKT 6am, all news are available + return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24) + + def get_fetchdate(self): + if __Date__ <> '': + return __Date__ + else: + return self.get_dtlocal().strftime("%Y%m%d") + + def get_fetchformatteddate(self): + if __Date__ <> '': + return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] + else: + return self.get_dtlocal().strftime("%Y-%m-%d") + + def get_fetchyear(self): + if __Date__ <> '': + return __Date__[0:4] + else: + return self.get_dtlocal().strftime("%Y") + + def get_fetchmonth(self): + if __Date__ <> '': + return __Date__[4:6] + else: + return self.get_dtlocal().strftime("%m") + + def get_fetchday(self): + if __Date__ <> '': + return __Date__[6:8] + else: + return self.get_dtlocal().strftime("%d") + + # Note: does not work with custom date given by __Date__ + def get_weekday(self): + return self.get_dtlocal().weekday() + + def populate_article_metadata(self, article, soup, first): + if first and hasattr(self, 'add_toc_thumbnail'): + picdiv = soup.find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,picdiv['src']) + + def parse_index(self): + feeds = [] + soup = self.index_to_soup('http://www.am730.com.hk/') + ul = soup.find(attrs={'class':'nav-section'}) + sectionList = [] + for li in ul.findAll('li'): + a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False) + title = li.find('a').get('title', False).strip() + sectionList.append((title, a)) + for title, url in sectionList: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + return feeds + + def parse_section(self, url): + soup = self.index_to_soup(url) + items = soup.findAll(attrs={'style':'padding-bottom: 15px;'}) + current_articles = [] + for item in items: + a = item.find(attrs={'class':'t6 f14'}).find('a', href=True) + articlelink = 'http://www.am730.com.hk/' + a.get('href', True) + title = self.tag_to_string(a) + description = self.tag_to_string(item.find(attrs={'class':'t3 f14'})) + current_articles.append({'title': title, 'url': articlelink, 'description': description}) + return current_articles + + def preprocess_html(self, soup): + multia = soup.findAll('a') + for a in multia: + if not (a == None): + image = a.find('img') + if not (image == None): + if __HiResImg__: + image['src'] = image.get('src').replace('/thumbs/', '/') + caption = image.get('alt') + tag = Tag(soup, "photo", []) + tag2 = Tag(soup, "photocaption", []) + tag.insert(0, image) + if not caption == None: + tag2.insert(0, caption) + tag.insert(1, tag2) + a.replaceWith(tag) + return soup + + def create_opf(self, feeds, dir=None): + if dir is None: + dir = self.output_dir + title = self.short_title() + if self.output_profile.periodical_date_in_title: + title += strftime(self.timefmt) + mi = MetaInformation(title, [__appname__]) + mi.publisher = __appname__ + mi.author_sort = __appname__ + if self.publication_type: + mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() + mi.timestamp = nowf() + article_titles, aseen = [], set() + for f in feeds: + for a in f: + if a.title and a.title not in aseen: + aseen.add(a.title) + article_titles.append(force_unicode(a.title, 'utf-8')) + + mi.comments = self.description + if not isinstance(mi.comments, unicode): + mi.comments = mi.comments.decode('utf-8', 'replace') + mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + + '\n\n'.join(article_titles)) + + language = canonicalize_lang(self.language) + if language is not None: + mi.language = language + # This one affects the pub date shown in kindle title + #mi.pubdate = nowf() + # now appears to need the time field to be > 12.00noon as well + mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) + opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') + + opf = OPFCreator(dir, mi) + # Add mastheadImage entry to section + mp = getattr(self, 'masthead_path', None) + if mp is not None and os.access(mp, os.R_OK): + from calibre.ebooks.metadata.opf2 import Guide + ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref.type = 'masthead' + ref.title = 'Masthead Image' + opf.guide.append(ref) + + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 'index.html')) + manifest.append(os.path.join(dir, 'index.ncx')) + + # Get cover + cpath = getattr(self, 'cover_path', None) + if cpath is None: + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') + if self.default_cover(pf): + cpath = pf.name + if cpath is not None and os.access(cpath, os.R_OK): + opf.cover = cpath + manifest.append(cpath) + + # Get masthead + mpath = getattr(self, 'masthead_path', None) + if mpath is not None and os.access(mpath, os.R_OK): + manifest.append(mpath) + + opf.create_manifest_from_files_in(manifest) + for mani in opf.manifest: + if mani.path.endswith('.ncx'): + mani.id = 'ncx' + if mani.path.endswith('mastheadImage.jpg'): + mani.id = 'masthead-image' + + entries = ['index.html'] + toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} + + + def feed_index(num, parent): + f = feeds[num] + for j, a in enumerate(f): + if getattr(a, 'downloaded', False): + adir = 'feed_%d/article_%d/'%(num, j) + auth = a.author + if not auth: + auth = None + desc = a.text_summary + if not desc: + desc = None + else: + desc = self.description_limiter(desc) + tt = a.toc_thumbnail if a.toc_thumbnail else None + entries.append('%sindex.html'%adir) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + parent.add_item('%sindex.html'%adir, None, + a.title if a.title else _('Untitled Article'), + play_order=po, author=auth, + description=desc, toc_thumbnail=tt) + last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + for sp in a.sub_pages: + prefix = os.path.commonprefix([opf_path, sp]) + relp = sp[len(prefix):] + entries.append(relp.replace(os.sep, '/')) + last = sp + + if os.path.exists(last): + with open(last, 'rb') as fi: + src = fi.read().decode('utf-8') + soup = BeautifulSoup(src) + body = soup.find('body') + if body is not None: + prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + templ = self.navbar.generate(True, num, j, len(f), + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + body.insert(len(body.contents), elem) + with open(last, 'wb') as fi: + fi.write(unicode(soup).encode('utf-8')) + if len(feeds) == 0: + raise Exception('All feeds are empty, aborting.') + + if len(feeds) > 1: + for i, f in enumerate(feeds): + entries.append('feed_%d/index.html'%i) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + auth = getattr(f, 'author', None) + if not auth: + auth = None + desc = getattr(f, 'description', None) + if not desc: + desc = None + feed_index(i, toc.add_item('feed_%d/index.html'%i, None, + f.title, play_order=po, description=desc, author=auth)) + + else: + entries.append('feed_%d/index.html'%0) + feed_index(0, toc) + + for i, p in enumerate(entries): + entries[i] = os.path.join(dir, p.replace('/', os.sep)) + opf.create_spine(entries) + opf.set_toc(toc) + + with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): + opf.render(opf_file, ncx_file) + diff --git a/recipes/apple_daily.recipe b/recipes/apple_daily.recipe index 763136c9b0..522427ed6a 100644 --- a/recipes/apple_daily.recipe +++ b/recipes/apple_daily.recipe @@ -1,161 +1,275 @@ -# -*- coding: utf-8 -*- -import re +# vim:fileencoding=UTF-8 +from __future__ import unicode_literals +__license__ = 'GPL v3' +__copyright__ = '2013, Eddie Lau' +__Date__ = '' + +from calibre import (__appname__, force_unicode, strftime) +from calibre.utils.date import now as nowf +import os, datetime, re from calibre.web.feeds.recipes import BasicNewsRecipe +from contextlib import nested +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.metadata.opf2 import OPFCreator +from calibre.ebooks.metadata.toc import TOC +from calibre.ebooks.metadata import MetaInformation +from calibre.utils.localization import canonicalize_lang class AppleDaily(BasicNewsRecipe): - - title = u'蘋果日報' - __author__ = u'蘋果日報' - __publisher__ = u'蘋果日報' - description = u'蘋果日報' - masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif' - language = 'zh_TW' - encoding = 'UTF-8' - timefmt = ' [%a, %d %b, %Y]' - needs_subscription = False + title = u'蘋果日報 (香港)' + __author__ = 'Eddie Lau' + publisher = '蘋果日報' + oldest_article = 1 + max_articles_per_feed = 100 + auto_cleanup = False + language = 'zh' + encoding = 'utf-8' + auto_cleanup = False remove_javascript = True - remove_tags_before = dict(name=['ul', 'h1']) - remove_tags_after = dict(name='form') - remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), - dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), - dict(name=['script', 'noscript', 'style', 'form'])] + use_embedded_content = False no_stylesheets = True - extra_css = ''' - @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n - body {margin-right: 8pt; font-family: 'uming', serif;} - h1 {font-family: 'uming', serif, sans-serif} - ''' - #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' + description = 'http://hkm.appledaily.com/' + category = 'Chinese, News, Hong Kong' + masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png' - preprocess_regexps = [ - (re.compile(r'img.php?server=(?P[^&]+)&path=(?P[^&]+).*', re.DOTALL|re.IGNORECASE), - lambda match: 'http://' + match.group('server') + '/' + match.group('path')), - ] + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}' + keep_only_tags = [dict(name='div', attrs={'id':'content-article'})] + remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}), + dict(name='p', attrs={'class':'next'})] + + def get_dtlocal(self): + dt_utc = datetime.datetime.utcnow() + # convert UTC to local hk time - at HKT 6am, all news are available + return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24) + + def get_fetchdate(self): + if __Date__ <> '': + return __Date__ + else: + return self.get_dtlocal().strftime("%Y%m%d") + + def get_fetchformatteddate(self): + if __Date__ <> '': + return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] + else: + return self.get_dtlocal().strftime("%Y-%m-%d") + + def get_fetchyear(self): + if __Date__ <> '': + return __Date__[0:4] + else: + return self.get_dtlocal().strftime("%Y") + + def get_fetchmonth(self): + if __Date__ <> '': + return __Date__[4:6] + else: + return self.get_dtlocal().strftime("%m") + + def get_fetchday(self): + if __Date__ <> '': + return __Date__[6:8] + else: + return self.get_dtlocal().strftime("%d") + + # Note: does not work with custom date given by __Date__ + def get_weekday(self): + return self.get_dtlocal().weekday() def get_cover_url(self): - return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif' - - - #def get_browser(self): - #br = BasicNewsRecipe.get_browser(self) - #if self.username is not None and self.password is not None: - # br.open('http://www.nytimes.com/auth/login') - # br.select_form(name='login') - # br['USERID'] = self.username - # br['PASSWORD'] = self.password - # br.submit() - #return br - - def preprocess_html(self, soup): - #process all the images - for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): - iurl = tag['src'] - #print 'checking image: ' + iurl - - #img\.php?server\=(?P[^&]+)&path=(?P[^&]+) - p = re.compile(r'img\.php\?server=(?P[^&]+)&path=(?P[^&]+)', re.DOTALL|re.IGNORECASE) - - m = p.search(iurl) - - if m is not None: - iurl = 'http://' + m.group('server') + '/' + m.group('path') - #print 'working! new url: ' + iurl - tag['src'] = iurl - #else: - #print 'not good' - - for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): - iurl = tag['href'] - #print 'checking image: ' + iurl - - #img\.php?server\=(?P[^&]+)&path=(?P[^&]+) - p = re.compile(r'img\.php\?server=(?P[^&]+)&path=(?P[^&]+)', re.DOTALL|re.IGNORECASE) - - m = p.search(iurl) - - if m is not None: - iurl = 'http://' + m.group('server') + '/' + m.group('path') - #print 'working! new url: ' + iurl - tag['href'] = iurl - #else: - #print 'not good' - - return soup + soup = self.index_to_soup('http://hkm.appledaily.com/') + cover = soup.find(attrs={'class':'top-news'}).get('src', False) + br = BasicNewsRecipe.get_browser(self) + try: + br.open(cover) + except: + cover = None + return cover + def populate_article_metadata(self, article, soup, first): + if first and hasattr(self, 'add_toc_thumbnail'): + picdiv = soup.find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,picdiv['src']) def parse_index(self): - base = 'http://news.hotpot.hk/fruit' - soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php') + feeds = [] + soup = self.index_to_soup('http://hkm.appledaily.com/') + ul = soup.find(attrs={'class':'menu'}) + sectionList = [] + for li in ul.findAll('li'): + a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False) + title = li.find('a', text=True).strip() + if not title == u'動新聞': + sectionList.append((title, a)) + for title, url in sectionList: + articles = self.parse_section(url) + if articles: + feeds.append((title, articles)) + return feeds - #def feed_title(div): - # return ''.join(div.findAll(text=True, recursive=False)).strip() + def parse_section(self, url): + soup = self.index_to_soup(url) + ul = soup.find(attrs={'class':'list'}) + current_articles = [] + for li in ul.findAll('li'): + a = li.find('a', href=True) + title = li.find('p', text=True).strip() + if a is not None: + current_articles.append({'title': title, 'url':'http://hkm.appledaily.com/' + a.get('href', False)}) + pass + return current_articles - articles = {} - key = None - ans = [] - for div in soup.findAll('li'): - key = div.find(text=True, recursive=True); - #if key == u'豪情': - # continue; + def create_opf(self, feeds, dir=None): + if dir is None: + dir = self.output_dir + title = self.short_title() + if self.output_profile.periodical_date_in_title: + title += strftime(self.timefmt) + mi = MetaInformation(title, [__appname__]) + mi.publisher = __appname__ + mi.author_sort = __appname__ + if self.publication_type: + mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() + mi.timestamp = nowf() + article_titles, aseen = [], set() + for f in feeds: + for a in f: + if a.title and a.title not in aseen: + aseen.add(a.title) + article_titles.append(force_unicode(a.title, 'utf-8')) - print 'section=' + key + mi.comments = self.description + if not isinstance(mi.comments, unicode): + mi.comments = mi.comments.decode('utf-8', 'replace') + mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + + '\n\n'.join(article_titles)) - articles[key] = [] + language = canonicalize_lang(self.language) + if language is not None: + mi.language = language + # This one affects the pub date shown in kindle title + #mi.pubdate = nowf() + # now appears to need the time field to be > 12.00noon as well + mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) + opf_path = os.path.join(dir, 'index.opf') + ncx_path = os.path.join(dir, 'index.ncx') - ans.append(key) + opf = OPFCreator(dir, mi) + # Add mastheadImage entry to section + mp = getattr(self, 'masthead_path', None) + if mp is not None and os.access(mp, os.R_OK): + from calibre.ebooks.metadata.opf2 import Guide + ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref.type = 'masthead' + ref.title = 'Masthead Image' + opf.guide.append(ref) - a = div.find('a', href=True) + manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest.append(os.path.join(dir, 'index.html')) + manifest.append(os.path.join(dir, 'index.ncx')) - if not a: - continue + # Get cover + cpath = getattr(self, 'cover_path', None) + if cpath is None: + pf = open(os.path.join(dir, 'cover.jpg'), 'wb') + if self.default_cover(pf): + cpath = pf.name + if cpath is not None and os.access(cpath, os.R_OK): + opf.cover = cpath + manifest.append(cpath) - url = base + '/' + a['href'] - print 'url=' + url + # Get masthead + mpath = getattr(self, 'masthead_path', None) + if mpath is not None and os.access(mpath, os.R_OK): + manifest.append(mpath) - if not articles.has_key(key): - articles[key] = [] - else: - # sub page - subSoup = self.index_to_soup(url) + opf.create_manifest_from_files_in(manifest) + for mani in opf.manifest: + if mani.path.endswith('.ncx'): + mani.id = 'ncx' + if mani.path.endswith('mastheadImage.jpg'): + mani.id = 'masthead-image' - for subDiv in subSoup.findAll('li'): - subA = subDiv.find('a', href=True) - subTitle = subDiv.find(text=True, recursive=True) - subUrl = base + '/' + subA['href'] - - print 'subUrl' + subUrl - - articles[key].append( - dict(title=subTitle, - url=subUrl, - date='', - description='', - content='')) + entries = ['index.html'] + toc = TOC(base_path=dir) + self.play_order_counter = 0 + self.play_order_map = {} -# elif div['class'] in ['story', 'story headline']: -# a = div.find('a', href=True) -# if not a: -# continue -# url = re.sub(r'\?.*', '', a['href']) -# url += '?pagewanted=all' -# title = self.tag_to_string(a, use_alt=True).strip() -# description = '' -# pubdate = strftime('%a, %d %b') -# summary = div.find(True, attrs={'class':'summary'}) -# if summary: -# description = self.tag_to_string(summary, use_alt=False) -# -# feed = key if key is not None else 'Uncategorized' -# if not articles.has_key(feed): -# articles[feed] = [] -# if not 'podcasts' in url: -# articles[feed].append( -# dict(title=title, url=url, date=pubdate, -# description=description, -# content='')) -# ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2}) - ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)] - return ans + def feed_index(num, parent): + f = feeds[num] + for j, a in enumerate(f): + if getattr(a, 'downloaded', False): + adir = 'feed_%d/article_%d/'%(num, j) + auth = a.author + if not auth: + auth = None + desc = a.text_summary + if not desc: + desc = None + else: + desc = self.description_limiter(desc) + tt = a.toc_thumbnail if a.toc_thumbnail else None + entries.append('%sindex.html'%adir) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + parent.add_item('%sindex.html'%adir, None, + a.title if a.title else _('Untitled Article'), + play_order=po, author=auth, + description=desc, toc_thumbnail=tt) + last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + for sp in a.sub_pages: + prefix = os.path.commonprefix([opf_path, sp]) + relp = sp[len(prefix):] + entries.append(relp.replace(os.sep, '/')) + last = sp + if os.path.exists(last): + with open(last, 'rb') as fi: + src = fi.read().decode('utf-8') + soup = BeautifulSoup(src) + body = soup.find('body') + if body is not None: + prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + templ = self.navbar.generate(True, num, j, len(f), + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + body.insert(len(body.contents), elem) + with open(last, 'wb') as fi: + fi.write(unicode(soup).encode('utf-8')) + if len(feeds) == 0: + raise Exception('All feeds are empty, aborting.') + + if len(feeds) > 1: + for i, f in enumerate(feeds): + entries.append('feed_%d/index.html'%i) + po = self.play_order_map.get(entries[-1], None) + if po is None: + self.play_order_counter += 1 + po = self.play_order_counter + auth = getattr(f, 'author', None) + if not auth: + auth = None + desc = getattr(f, 'description', None) + if not desc: + desc = None + feed_index(i, toc.add_item('feed_%d/index.html'%i, None, + f.title, play_order=po, description=desc, author=auth)) + + else: + entries.append('feed_%d/index.html'%0) + feed_index(0, toc) + + for i, p in enumerate(entries): + entries[i] = os.path.join(dir, p.replace('/', os.sep)) + opf.create_spine(entries) + opf.set_toc(toc) + + with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): + opf.render(opf_file, ncx_file) diff --git a/recipes/singtaohk.recipe b/recipes/singtaohk.recipe index d830381731..bb76c335a0 100644 --- a/recipes/singtaohk.recipe +++ b/recipes/singtaohk.recipe @@ -1,30 +1,30 @@ +# vim:fileencoding=UTF-8 +from __future__ import unicode_literals __license__ = 'GPL v3' -__copyright__ = '2011, Eddie Lau' +__copyright__ = '2011-2013, Eddie Lau' # data source: normal, mobile __Source__ = 'mobile' # please replace the following "True" with "False". (Default: True) __MakePeriodical__ = True # Turn below to True if your device supports display of CJK titles (Default: False) -__UseChineseTitle__ = False +__UseChineseTitle__ = True # Set it to False if you want to skip images (Default: True) __KeepImages__ = True # Set it to True if you want to include a summary in Kindle's article view (Default: False) -__IncludeSummary__ = False +__IncludeSummary__ = True # Set it to True if you want thumbnail images in Kindle's article view (Default: True) __IncludeThumbnails__ = True ''' Change Log: +2013/03/31 -- fix cover retrieval code and heading size, and remove   in summary 2011/12/29 -- first version done -TODO: -* use alternative source at http://m.singtao.com/index.php ''' from calibre.utils.date import now as nowf import os, datetime, re -from datetime import date from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested from calibre.ebooks.BeautifulSoup import BeautifulSoup @@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe): title = 'Sing Tao Daily - Hong Kong' description = 'Hong Kong Chinese Newspaper (http://singtao.com)' category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}' masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png' if __Source__ == 'normal': keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})] @@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe): return self.get_dtlocal().strftime("%d") def get_cover_url(self): - #cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29 - base = 2660 - todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday())) - diff = todaydate - date(2011, 12, 29) - base = base + int(diff.total_seconds()/(3600*24)) - cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg' + soup = self.index_to_soup('http://m.singtao.com/') + cover = soup.find(attrs={'class':'special'}).get('src', False) br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: - cover = 'http://singtao.com/images/stlogo.gif' + cover = None return cover def parse_index(self): @@ -289,11 +285,11 @@ class STHKRecipe(BasicNewsRecipe): # the text may or may not be enclosed in

tag paras = articlebody.findAll('p') if not paras: - paras = articlebody + paras = articlebody textFound = False for p in paras: if not textFound: - summary_candidate = self.tag_to_string(p).strip() + summary_candidate = self.tag_to_string(p).strip().replace(' ', '') if len(summary_candidate) > 0: summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1) article.summary = article.text_summary = summary_candidate @@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe): + From b1cc151ed781ef549b2a5c71cc86ba92c312595e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Apr 2013 12:38:29 +0530 Subject: [PATCH 7/8] Add libimobiledevice to linux builds --- setup/installer/linux/freeze2.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup/installer/linux/freeze2.py b/setup/installer/linux/freeze2.py index cbf0363fc9..44b8e81bff 100644 --- a/setup/installer/linux/freeze2.py +++ b/setup/installer/linux/freeze2.py @@ -47,6 +47,10 @@ binary_includes = [ '/usr/lib/libgthread-2.0.so.0', '/usr/lib/libpng14.so.14', '/usr/lib/libexslt.so.0', + # Ensure that libimobiledevice is compiled against openssl, not gnutls + '/usr/lib/libimobiledevice.so.3', + '/usr/lib/libusbmuxd.so.2', + '/usr/lib/libplist.so.1', MAGICK_PREFIX+'/lib/libMagickWand.so.5', MAGICK_PREFIX+'/lib/libMagickCore.so.5', '/usr/lib/libgcrypt.so.11', From 2d4746a39d33cc646125576bf4ac8b6e50179194 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Apr 2013 12:52:49 +0530 Subject: [PATCH 8/8] Add libimobiledevice to OS X build --- setup/installer/osx/app/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup/installer/osx/app/main.py b/setup/installer/osx/app/main.py index 345b75f56f..2182038088 100644 --- a/setup/installer/osx/app/main.py +++ b/setup/installer/osx/app/main.py @@ -399,7 +399,8 @@ class Py2App(object): @flush def add_fontconfig(self): info('\nAdding fontconfig') - for x in ('fontconfig.1', 'freetype.6', 'expat.1'): + for x in ('fontconfig.1', 'freetype.6', 'expat.1', + 'plist.1', 'usbmuxd.2', 'imobiledevice.3'): src = os.path.join(SW, 'lib', 'lib'+x+'.dylib') self.install_dylib(src) dst = os.path.join(self.resources_dir, 'fonts')