diff --git a/resources/recipes/bbc.recipe b/resources/recipes/bbc.recipe index 3634769d85..46be17a9e7 100644 --- a/resources/recipes/bbc.recipe +++ b/resources/recipes/bbc.recipe @@ -1,38 +1,47 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' +__copyright__ = '2010, Darko Miletic ' ''' -bbc.co.uk +news.bbc.co.uk ''' -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.recipes import BasicNewsRecipe class BBC(BasicNewsRecipe): - title = u'The BBC' - __author__ = 'Kovid Goyal ans Sujata Raman' - description = 'Global news and current affairs from the British Broadcasting Corporation' - language = 'en' + title = 'The BBC' + __author__ = 'Darko Miletic' + description = 'Global news and current affairs from the British Broadcasting Corporation' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + #delay = 1 + use_embedded_content = False + encoding = 'utf8' + publisher = 'BBC' + category = 'news, UK, world' + language = 'en_GB' + publication_type = 'newsportal' + extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] - no_stylesheets = True - remove_tags = [dict(name='div', attrs={'class':'footer'}), - {'id' : ['popstory','blq-footer']}, - {'class' : ['arrup','links','relatedbbcsites','arr','promobottombg','bbccom_visibility_hidden', 'sharesb', 'sib606', 'mvtb', 'storyextra', 'sidebar1', 'bbccom_text','promotopbg', 'gppromo','promotopbg','bbccom_display_none']}, - ] + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } - keep_only_tags = [dict(name='div', attrs={'class':'mainwrapper'})] - - extra_css = ''' - body{font-family:Arial,Helvetica,sans-serif; font-size:small; align:left} - h1{font-size:large;} - .sh{font-size:large; font-weight:bold} - .cap{font-size:xx-small; } - .lu{font-size:xx-small; } - .ds{font-size:xx-small; } - .mvb{font-size:xx-small;} - .by1{font-size:x-small; color:#666666} - .byd{font-size:x-small;} - ''' + keep_only_tags = [ + dict(attrs={'id' :['meta-information','story-body']}) + ,dict(attrs={'class':['mxb' ,'storybody' ]}) + ] + remove_tags = [ + dict(name=['object','link','table']) + ,dict(attrs={'class':['caption','caption full-width','story-actions','hidden','sharesb','audioInStoryC']}) + ] + remove_tags_after = dict(attrs={'class':'sharesb'}) + remove_attributes = ['width','height'] feeds = [ ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'), @@ -50,22 +59,3 @@ class BBC(BasicNewsRecipe): ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'), ] - def postprocess_html(self, soup, first): - - for tag in soup.findAll(name= 'img', alt=""): - tag.extract() - - for item in soup.findAll(align = "right"): - del item['align'] - - for tag in soup.findAll(name=['table', 'tr', 'td']): - tag.name = 'div' - - return soup - - - - # def print_version(self, url): - # return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/') - - diff --git a/resources/recipes/bbc_fast.recipe b/resources/recipes/bbc_fast.recipe index 12ae9ce1eb..1af3bf8d1f 100644 --- a/resources/recipes/bbc_fast.recipe +++ b/resources/recipes/bbc_fast.recipe @@ -3,7 +3,7 @@ __copyright__ = '2010, Darko Miletic ' ''' news.bbc.co.uk ''' - +import re from calibre.web.feeds.recipes import BasicNewsRecipe class BBC(BasicNewsRecipe): @@ -18,22 +18,28 @@ class BBC(BasicNewsRecipe): encoding = 'utf8' publisher = 'BBC' category = 'news, UK, world' - language = 'en' - extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } ' - + language = 'en_GB' + publication_type = 'newsportal' + extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] conversion_options = { 'comments' : description ,'tags' : category ,'language' : language ,'publisher' : publisher + ,'linearize_tables': True } - remove_tags_before = dict(name='div',attrs={'class':'headline'}) - remove_tags_after = dict(name='div', attrs={'class':'footer'}) - remove_tags = [ - dict(name=['object','link','script','iframe']) - ,dict(name='div', attrs={'class':'footer'}) + keep_only_tags = [ + dict(attrs={'id' :['meta-information','story-body']}) + ,dict(attrs={'class':['mxb' ,'storybody' ]}) ] + remove_tags = [ + dict(name=['object','link','table','img']) + ,dict(attrs={'class':['caption','caption full-width','story-actions','hidden','sharesb','audioInStoryC']}) + ] + remove_tags_after = dict(attrs={'class':'sharesb'}) + remove_attributes = ['width','height'] feeds = [ ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'), @@ -51,10 +57,3 @@ class BBC(BasicNewsRecipe): ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'), ] - def print_version(self, url): - emp,sep,rstrip = url.partition('http://') - return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rstrip - - def get_article_url(self, article): - return article.get('guid', None) - diff --git a/resources/recipes/las_vegas_review.recipe b/resources/recipes/las_vegas_review.recipe new file mode 100644 index 0000000000..9292c105a4 --- /dev/null +++ b/resources/recipes/las_vegas_review.recipe @@ -0,0 +1,24 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1274742400(BasicNewsRecipe): + + title = u'Las Vegas Review Journal' + __author__ = 'Joel' + language = 'en' + + oldest_article = 7 + + max_articles_per_feed = 100 + + feeds = [ + (u'News', u'http://www.lvrj.com/news.rss'), + (u'Business', u'http://www.lvrj.com/business.rss'), + (u'Living', u'http://www.lvrj.com/living.rss'), + (u'Opinion', u'http://www.lvrj.com/opinion.rss'), + (u'Neon', u'http://www.lvrj.com/neon.rss'), + (u'Image', u'http://www.lvrj.com/image.rss'), + (u'Home & Garden', u'http://www.lvrj.com/home_and_garden.rss'), + (u'Furniture & Design', u'http://www.lvrj.com/furniture_and_design.rss'), + (u'Drive', u'http://www.lvrj.com/drive.rss'), + (u'Real Estate', u'http://www.lvrj.com/real_estate.rss'), + (u'Sports', u'http://www.lvrj.com/sports.rss')] diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index a991f2b83c..bd429040d4 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -9,14 +9,13 @@ import re import time from calibre import entity_to_unicode from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \ -Comment, BeautifulStoneSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment class NYTimes(BasicNewsRecipe): title = 'New York Times Top Stories' __author__ = 'GRiker' - language = 'en' + language = _('English') description = 'Top Stories from the New York Times' # List of sections typically included in Top Stories. Use a keyword from the @@ -257,6 +256,7 @@ class NYTimes(BasicNewsRecipe): # Fetch the outer table table = soup.find('table') previousTable = table + contentTable = None # Find the deepest table containing the stories while True : @@ -388,6 +388,10 @@ class NYTimes(BasicNewsRecipe): return ans def preprocess_html(self, soup): + # Skip ad pages before actual article + skip_tag = soup.find(True, {'name':'skip'}) + if skip_tag is not None: + soup = self.index_to_soup(skip_tag.parent['href']) return self.strip_anchors(soup) def postprocess_html(self,soup, True): diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index a3ef2555f4..86bb3409f2 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -82,6 +82,7 @@ class NYTimes(BasicNewsRecipe): 'articleExtras', 'articleInline', 'blog_sidebar', + 'businessSearchBar', 'cCol', 'entertainmentSearchBar', 'footer', @@ -286,9 +287,14 @@ class NYTimes(BasicNewsRecipe): raw = self.browser.open('http://www.nytimes.com'+content).read() return BeautifulSoup(raw.decode('cp1252', 'replace')) ''' + # Skip ad pages before actual article + skip_tag = soup.find(True, {'name':'skip'}) + if skip_tag is not None: + soup = self.index_to_soup(skip_tag.parent['href']) return self.strip_anchors(soup) def postprocess_html(self,soup, True): + print "\npostprocess_html()\n" if self.one_picture_per_article: # Remove all images after first @@ -411,6 +417,7 @@ class NYTimes(BasicNewsRecipe): return soup def postprocess_book(self, oeb, opts, log) : + print "\npostprocess_book()\n" def extract_byline(href) : # ', re.DOTALL|re.IGNORECASE),lambda match: '')] @@ -38,6 +40,8 @@ class Wired(BasicNewsRecipe): remove_tags = [ dict(name=['object','embed','iframe','link']) ,dict(name='div', attrs={'class':['podcast_storyboard','tweetmeme_button']}) + ,dict(attrs={'id':'ff_bottom_nav'}) + ,dict(name='a',attrs={'href':'http://www.wired.com/app'}) ] remove_attributes = ['height','width'] @@ -72,17 +76,18 @@ class Wired(BasicNewsRecipe): farticles = [] for item in features.findAll('div',attrs={'class':'section'}): divurl = item.find('div',attrs={'class':'feature-header'}) - divdesc = item.find('div',attrs={'class':'feature-text'}) - url = 'http://www.wired.com' + divurl.a['href'] - title = self.tag_to_string(divurl.a) - description = self.tag_to_string(divdesc) - date = strftime(self.timefmt) - farticles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) + if divurl: + divdesc = item.find('div',attrs={'class':'feature-text'}) + url = 'http://www.wired.com' + divurl.a['href'] + title = self.tag_to_string(divurl.a) + description = self.tag_to_string(divdesc) + date = strftime(self.timefmt) + farticles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':description + }) totalfeeds.append(('Featured Articles', farticles)) #department feeds departments = ['rants','start','test','play','found'] diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 6c5bc9e070..b76c2f6f4e 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -444,7 +444,7 @@ from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX from calibre.devices.nook.driver import NOOK from calibre.devices.prs505.driver import PRS505 from calibre.devices.android.driver import ANDROID, S60 -from calibre.devices.nokia.driver import N770, N810 +from calibre.devices.nokia.driver import N770, N810, E71X from calibre.devices.eslick.driver import ESLICK from calibre.devices.nuut2.driver import NUUT2 from calibre.devices.iriver.driver import IRIVER_STORY @@ -453,8 +453,9 @@ from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK from calibre.devices.edge.driver import EDGE from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS from calibre.devices.sne.driver import SNE -from calibre.devices.misc import PALMPRE, KOBO, AVANT +from calibre.devices.misc import PALMPRE, AVANT from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG +from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon from calibre.library.catalog import CSV_XML, EPUB_MOBI @@ -513,6 +514,7 @@ plugins += [ ANDROID, S60, N770, + E71X, N810, COOL_ER, ESLICK, diff --git a/src/calibre/devices/kobo/__init__.py b/src/calibre/devices/kobo/__init__.py new file mode 100644 index 0000000000..0080175bfa --- /dev/null +++ b/src/calibre/devices/kobo/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + + + diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py new file mode 100644 index 0000000000..4b14b2bf8e --- /dev/null +++ b/src/calibre/devices/kobo/driver.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +from calibre.devices.usbms.driver import USBMS + +class KOBO(USBMS): + + name = 'Kobo Reader Device Interface' + gui_name = 'Kobo Reader' + description = _('Communicate with the Kobo Reader') + author = 'Kovid Goyal' + + supported_platforms = ['windows', 'osx', 'linux'] + + # Ordered list of supported formats + FORMATS = ['epub', 'pdf'] + + VENDOR_ID = [0x2237] + PRODUCT_ID = [0x4161] + BCD = [0x0110] + + VENDOR_NAME = 'KOBO_INC' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '.KOBOEREADER' + + EBOOK_DIR_MAIN = '' + SUPPORTS_SUB_DIRS = True + diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py index c7e0356f32..4310c51421 100644 --- a/src/calibre/devices/misc.py +++ b/src/calibre/devices/misc.py @@ -28,27 +28,6 @@ class PALMPRE(USBMS): EBOOK_DIR_MAIN = 'E-books' -class KOBO(USBMS): - - name = 'Kobo Reader Device Interface' - gui_name = 'Kobo Reader' - description = _('Communicate with the Kobo Reader') - author = 'Kovid Goyal' - - supported_platforms = ['windows', 'osx', 'linux'] - - # Ordered list of supported formats - FORMATS = ['epub', 'pdf'] - - VENDOR_ID = [0x2237] - PRODUCT_ID = [0x4161] - BCD = [0x0110] - - VENDOR_NAME = 'KOBO_INC' - WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = '.KOBOEREADER' - - EBOOK_DIR_MAIN = '' - SUPPORTS_SUB_DIRS = True class AVANT(USBMS): name = 'Booq Avant Device Interface' diff --git a/src/calibre/devices/nokia/driver.py b/src/calibre/devices/nokia/driver.py index 59c181a4da..66a4243f2b 100644 --- a/src/calibre/devices/nokia/driver.py +++ b/src/calibre/devices/nokia/driver.py @@ -45,3 +45,25 @@ class N810(N770): WINDOWS_MAIN_MEM = 'N810' MAIN_MEMORY_VOLUME_LABEL = 'N810 Main Memory' + +class E71X(USBMS): + + name = 'Nokia E71X device interface' + gui_name = 'Nokia E71X' + description = 'Communicate with the Nokia E71X' + author = 'Kovid Goyal' + supported_platforms = ['windows', 'linux', 'osx'] + + VENDOR_ID = [0x421] + PRODUCT_ID = [0x1a0] + BCD = [0x100] + + + FORMATS = ['mobi', 'prc'] + + EBOOK_DIR_MAIN = 'eBooks' + SUPPORTS_SUB_DIRS = True + + VENDOR_NAME = 'NOKIA' + WINDOWS_MAIN_MEM = 'S60' + diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 5c4e255177..214511ae14 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -117,7 +117,7 @@ class EPUBInput(InputFormatPlugin): encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) opf = None for f in walk(u'.'): - if f.lower().endswith('.opf'): + if f.lower().endswith('.opf') and '__MACOSX' not in f: opf = os.path.abspath(f) break path = getattr(stream, 'name', 'stream') @@ -146,6 +146,10 @@ class EPUBInput(InputFormatPlugin): self.rationalize_cover(opf, log) self.optimize_opf_parsing = opf + for x in opf.itermanifest(): + if x.get('media-type', '') == 'application/x-dtbook+xml': + raise ValueError( + 'EPUB files with DTBook markup are not supported') with open('content.opf', 'wb') as nopf: nopf.write(opf.render()) diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 180b0c1f23..ee779aaefa 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -106,7 +106,7 @@ class EPUBOutput(OutputFormatPlugin): recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)]) - def workaround_webkit_quirks(self): + def workaround_webkit_quirks(self): # {{{ from calibre.ebooks.oeb.base import XPath for x in self.oeb.spine: root = x.data @@ -120,8 +120,9 @@ class EPUBOutput(OutputFormatPlugin): for pre in XPath('//h:pre')(body): if not pre.text and len(pre) == 0: pre.tag = 'div' + # }}} - def upshift_markup(self): + def upshift_markup(self): # {{{ 'Upgrade markup to comply with XHTML 1.1 where possible' from calibre.ebooks.oeb.base import XPath for x in self.oeb.spine: @@ -135,6 +136,7 @@ class EPUBOutput(OutputFormatPlugin): for u in XPath('//h:u')(root): u.tag = 'span' u.set('style', 'text-decoration:underline') + # }}} def convert(self, oeb, output_path, input_plugin, opts, log): self.log, self.opts, self.oeb = log, opts, oeb @@ -161,8 +163,10 @@ class EPUBOutput(OutputFormatPlugin): self.workaround_sony_quirks() if self.oeb.toc.count() == 0: - self.log.warn('This EPUB file has no Table of Contents. It will ' - 'not validate via epubcheck') + self.log.warn('This EPUB file has no Table of Contents. ' + 'Creating a default TOC') + first = iter(self.oeb.spine).next() + self.oeb.toc.add(_('Start'), first.href) from calibre.ebooks.oeb.base import OPF identifiers = oeb.metadata['identifier'] @@ -202,7 +206,7 @@ class EPUBOutput(OutputFormatPlugin): self.log.info('EPUB extracted to', opts.extract_to) epub.close() - def encrypt_fonts(self, uris, tdir, uuid): + def encrypt_fonts(self, uris, tdir, uuid): # {{{ from binascii import unhexlify key = re.sub(r'[^a-fA-F0-9]', '', uuid) @@ -247,6 +251,7 @@ class EPUBOutput(OutputFormatPlugin): ans += (u'\n'.join(fonts)).encode('utf-8') ans += '\n</encryption>' return ans + # }}} def condense_ncx(self, ncx_path): if not self.opts.pretty_print: @@ -259,7 +264,7 @@ class EPUBOutput(OutputFormatPlugin): compressed = etree.tostring(tree.getroot(), encoding='utf-8') open(ncx_path, 'wb').write(compressed) - def workaround_ade_quirks(self): + def workaround_ade_quirks(self): # {{{ ''' Perform various markup transforms to get the output to render correctly in the quirky ADE. @@ -388,8 +393,9 @@ class EPUBOutput(OutputFormatPlugin): else: self.oeb.log.warn('No stylesheet found') + # }}} - def workaround_sony_quirks(self): + def workaround_sony_quirks(self): # {{{ ''' Perform toc link transforms to alleviate slow loading. ''' @@ -436,3 +442,6 @@ class EPUBOutput(OutputFormatPlugin): if self.oeb.toc: simplify_toc_entry(self.oeb.toc) + + # }}} + diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 020cf8d202..7912d26e83 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -21,7 +21,9 @@ from calibre.utils.logging import Log from calibre import guess_type, prints from calibre.ebooks.oeb.transforms.cover import CoverManager -TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace('__ar__', 'none') +TITLEPAGE = CoverManager.SVG_TEMPLATE.decode('utf-8').replace(\ + '__ar__', 'none').replace('__viewbox__', '0 0 600 800' + ).replace('__width__', '600').replace('__height__', '800') def character_count(html): '''