From c024252eb88d16006401ec31cf27ebf36083b0bb Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 2 Mar 2010 21:36:51 -0500 Subject: [PATCH 01/14] Fix bug in #4971: invalid mode. --- src/calibre/ebooks/pdb/pdf/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index 7ad5776325..3ae9f8ccca 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -27,7 +27,7 @@ class Reader(FormatReader): self.log.info('Extracting PDF...') with TemporaryFile() as pdf_n: - pdf = open(pdf_n, 'rw+b') + pdf = open(pdf_n, 'rwb') for x in xrange(self.header.section_count()): pdf.write(self.header.section_data(x)) From 68f0f892e4f04dbdf4f8252773babfb9c369b594 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 01:23:22 -0700 Subject: [PATCH 02/14] EPUB to EPUB conversions: Preserve font encryption --- src/calibre/ebooks/epub/input.py | 36 +++++++++------ src/calibre/ebooks/epub/output.py | 68 ++++++++++++++++++++++++++++- src/calibre/ebooks/metadata/opf2.py | 3 ++ 3 files changed, 92 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index cf903c0a5d..48699521c7 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, re, uuid +import os, uuid from itertools import cycle from lxml import etree @@ -19,8 +19,7 @@ class EPUBInput(InputFormatPlugin): recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)]) - @classmethod - def decrypt_font(cls, key, path): + def decrypt_font(self, key, path): raw = open(path, 'rb').read() crypt = raw[:1024] key = cycle(iter(key)) @@ -29,13 +28,18 @@ class EPUBInput(InputFormatPlugin): f.write(decrypt) f.write(raw[1024:]) - @classmethod - def process_encryption(cls, encfile, 
opf, log): + def process_encryption(self, encfile, opf, log): key = None - m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) - if m: - key = m.group(1) - key = list(map(ord, uuid.UUID(key).bytes)) + for item in opf.identifier_iter(): + scheme = None + for key in item.attrib.keys(): + if key.endswith('scheme'): + scheme = item.get(key) + if (scheme and scheme.lower() == 'uuid') or \ + (item.text and item.text.startswith('urn:uuid:')): + key = str(item.text).rpartition(':')[-1] + key = list(map(ord, uuid.UUID(key).bytes)) + try: root = etree.parse(encfile) for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): @@ -46,7 +50,8 @@ class EPUBInput(InputFormatPlugin): uri = cr.get('URI') path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) if os.path.exists(path): - cls.decrypt_font(key, path) + self._encrypted_font_uris.append(uri) + self.decrypt_font(key, path) return True except: import traceback @@ -115,14 +120,17 @@ class EPUBInput(InputFormatPlugin): if opf is None: raise ValueError('%s is not a valid EPUB file'%path) - if os.path.exists(encfile): - if not self.process_encryption(encfile, opf, log): - raise DRMError(os.path.basename(path)) - opf = os.path.relpath(opf, os.getcwdu()) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) + self._encrypted_font_uris = [] + if os.path.exists(encfile): + if not self.process_encryption(encfile, opf, log): + raise DRMError(os.path.basename(path)) + self.encrypted_fonts = self._encrypted_font_uris + + if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1])+'/' for elem in opf.itermanifest(): diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 6e74a748b1..2b27f09664 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -12,8 +12,9 @@ from urllib import unquote from calibre.customize.conversion import OutputFormatPlugin from calibre.ptempfile import 
TemporaryDirectory from calibre.constants import __appname__, __version__ -from calibre import strftime, guess_type, prepare_string_for_xml +from calibre import strftime, guess_type, prepare_string_for_xml, CurrentDir from calibre.customize.conversion import OptionRecommendation +from calibre.constants import filesystem_encoding from lxml import etree @@ -170,6 +171,19 @@ class EPUBOutput(OutputFormatPlugin): self.workaround_sony_quirks() + from calibre.ebooks.oeb.base import OPF + identifiers = oeb.metadata['identifier'] + uuid = None + for x in identifiers: + if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'): + uuid = unicode(x).split(':')[-1] + break + if uuid is None: + self.log.warn('No UUID identifier found') + from uuid import uuid4 + uuid = str(uuid4()) + oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid) + with TemporaryDirectory('_epub_output') as tdir: from calibre.customize.ui import plugin_for_output_format oeb_output = plugin_for_output_format('oeb') @@ -177,10 +191,16 @@ class EPUBOutput(OutputFormatPlugin): opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0] self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\ if x.endswith('.ncx')][0]) + encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', []) + encryption = None + if encrypted_fonts: + encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid) from calibre.ebooks.epub import initialize_container epub = initialize_container(output_path, os.path.basename(opf)) epub.add_dir(tdir) + if encryption is not None: + epub.writestr('META-INF/encryption.xml', encryption) if opts.extract_to is not None: if os.path.exists(opts.extract_to): shutil.rmtree(opts.extract_to) @@ -189,6 +209,52 @@ class EPUBOutput(OutputFormatPlugin): self.log.info('EPUB extracted to', opts.extract_to) epub.close() + def encrypt_fonts(self, uris, tdir, uuid): + from binascii import unhexlify + + key = re.sub(r'[^a-fA-F0-9]', '', uuid) + if len(key) < 16: + 
raise ValueError('UUID identifier %r is invalid'%uuid) + key = unhexlify((key + key)[:32]) + key = tuple(map(ord, key)) + paths = [] + with CurrentDir(tdir): + paths = [os.path.join(*x.split('/')) for x in uris] + uris = dict(zip(uris, paths)) + fonts = [] + for uri in list(uris.keys()): + path = uris[uri] + if isinstance(path, unicode): + path = path.encode(filesystem_encoding) + if not os.path.exists(path): + uris.pop(uri) + continue + self.log.debug('Encrypting font:', uri) + with open(path, 'r+b') as f: + data = f.read(1024) + f.seek(0) + for i in range(1024): + f.write(chr(ord(data[i]) ^ key[i%16])) + if not isinstance(uri, unicode): + uri = uri.decode('utf-8') + fonts.append(u''' + + + + + + + '''%(uri.replace('"', '\\"'))) + if fonts: + ans = ''' + ''' + ans += (u'\n'.join(fonts)).encode('utf-8') + ans += '\n' + return ans + def default_cover(self): ''' Create a generic cover for books that dont have a cover diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 5e57b0b515..5cbaf604c4 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -779,6 +779,9 @@ class OPF(object): self.set_text(matches[0], unicode(val)) return property(fget=fget, fset=fset) + def identifier_iter(self): + for item in self.identifier_path(self.metadata): + yield item def guess_cover(self): ''' From 13a9733d42f8dcfc5e585276637ec60888bc63b5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 01:29:14 -0700 Subject: [PATCH 03/14] Astronomy Pic of the Day by Starson17. 
Fixes #5045 (New Recipe: Astronomy Picture of the Day) --- resources/recipes/apod.recipe | 37 +++++++++ resources/recipes/epicurious.recipe | 116 ++++++++++++++-------------- 2 files changed, 95 insertions(+), 58 deletions(-) create mode 100644 resources/recipes/apod.recipe diff --git a/resources/recipes/apod.recipe b/resources/recipes/apod.recipe new file mode 100644 index 0000000000..01f4ebf391 --- /dev/null +++ b/resources/recipes/apod.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class APOD(BasicNewsRecipe): + title = u'Astronomy Picture of the Day' + __author__ = 'Starson17' + description = 'Astronomy Pictures' + language = 'en' + use_embedded_content = False + no_stylesheets = True + cover_url = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg' + remove_javascript = True + recursions = 0 + oldest_article = 14 + + feeds = [ + (u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss') + ] + + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} + p{font-family:Arial,Helvetica,sans-serif;font-size:small;} + body{font-family:Helvetica,Arial,sans-serif;font-size:small;} + ''' + def postprocess_html(self, soup, first_fetch): + center_tags = soup.findAll(['center']) + p_tags = soup.findAll(['p']) + last_center = center_tags[-1:] + last_center[0].extract() + first_p = p_tags[:1] + for tag in first_p: + tag.extract() + last2_p = p_tags[-2:] + for tag in last2_p: + tag.extract() + return soup + diff --git a/resources/recipes/epicurious.recipe b/resources/recipes/epicurious.recipe index 7d0925a4bb..dc86af73fd 100644 --- a/resources/recipes/epicurious.recipe +++ b/resources/recipes/epicurious.recipe @@ -1,58 +1,58 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2010, Starson17' -''' -www.epicurious.com -''' -import re -from calibre.web.feeds.news import BasicNewsRecipe - -class 
Epicurious(BasicNewsRecipe): - title = u'Epicurious' - __author__ = 'Starson17' - description = 'Food and Recipes from Epicurious' - cover_url = 'http://up6.podbean.com/image-logos/21849_logo.jpg' - publisher = 'Epicurious' - tags = 'news, food, gourmet, recipes' - language = 'en' - use_embedded_content = False - no_stylesheets = True - remove_javascript = True - recursions = 3 - oldest_article = 14 - max_articles_per_feed = 20 - - keep_only_tags = [dict(name='div', attrs={'class':['mainconsolewrapper','videoheader','content_unit','entry-content','see_more_block']}), - dict(name='div', attrs={'id':['headline','introBlock','ingredients','preparation','articleContent','in_categories_block']}) - ] - - remove_tags = [{'id':['printShoppingList','addnoteLnk','btnUploadVideo','enlarge_image']}, - {'class':['subLnk','sbmWrapper','detail_division','entry-footer','comment-footer']}, - dict(name='div', attrs={'class':['tagged','comments']}) - ] - - remove_tags_after = [dict(name='div', attrs={'class':'entry-content'})] - - feeds = [ - (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'), - (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'), - (u'Features ', u'http://feeds.epicurious.com/latestfeatures'), - (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog') - ] - - match_regexps = [ - r'http://www.epicurious.com/.*recipes/.*/views' - ] - - preprocess_regexps = [ - (re.compile(r'/\n', re.DOTALL|re.IGNORECASE), lambda match: '/'), - (re.compile(r'_116.jpg', re.DOTALL|re.IGNORECASE), lambda match: '.jpg'), - (re.compile('
', re.DOTALL|re.IGNORECASE), lambda match: '') - ] - - def postprocess_html(self, soup, first_fetch): - for t in soup.findAll(['table', 'tr', 'td']): - t.name = 'div' - return soup - \ No newline at end of file +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2010, Starson17' +''' +www.epicurious.com +''' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class Epicurious(BasicNewsRecipe): + title = u'Epicurious' + __author__ = 'Starson17' + description = 'Food and Recipes from Epicurious' + cover_url = 'http://up6.podbean.com/image-logos/21849_logo.jpg' + publisher = 'Epicurious' + tags = 'news, food, gourmet, recipes' + language = 'en' + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + recursions = 3 + oldest_article = 14 + max_articles_per_feed = 20 + + keep_only_tags = [dict(name='div', attrs={'class':['mainconsolewrapper','videoheader','content_unit','entry-content','see_more_block']}), + dict(name='div', attrs={'id':['headline','introBlock','ingredients','preparation','articleContent','in_categories_block']}) + ] + + remove_tags = [{'id':['printShoppingList','addnoteLnk','btnUploadVideo','enlarge_image']}, + {'class':['subLnk','sbmWrapper','detail_division','entry-footer','comment-footer']}, + dict(name='div', attrs={'class':['tagged','comments']}) + ] + + remove_tags_after = [dict(name='div', attrs={'class':'entry-content'})] + + feeds = [ + (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'), + (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'), + (u'Features ', u'http://feeds.epicurious.com/latestfeatures'), + (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog') + ] + + match_regexps = [ + r'http://www.epicurious.com/.*recipes/.*/views' + ] + + preprocess_regexps = [ + (re.compile(r'/\n', re.DOTALL|re.IGNORECASE), lambda match: '/'), + (re.compile(r'_116.jpg', re.DOTALL|re.IGNORECASE), lambda match: '.jpg'), + (re.compile('
', re.DOTALL|re.IGNORECASE), lambda match: '') + ] + + def postprocess_html(self, soup, first_fetch): + for t in soup.findAll(['table', 'tr', 'td']): + t.name = 'div' + return soup + From 833c54c5d2f4bb9a18a3e9bca3dff6a9cc5361b7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 01:43:38 -0700 Subject: [PATCH 04/14] When decoding NCX toc files, if no encoding is declared and detection has less that 100% confidence, assume UTF-8. Fixes #5039 (Strange behaviour of TOC for one character) --- src/calibre/ebooks/chardet/__init__.py | 8 +++++--- src/calibre/ebooks/metadata/toc.py | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index 975ffc1331..25341b120a 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } -def force_encoding(raw, verbose): +def force_encoding(raw, verbose, assume_utf8=False): from calibre.constants import preferred_encoding try: chardet = detect(raw) except: chardet = {'encoding':preferred_encoding, 'confidence':0} encoding = chardet['encoding'] + if chardet['confidence'] < 1 and assume_utf8: + encoding = 'utf-8' if chardet['confidence'] < 1 and verbose: print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) if not encoding: @@ -73,7 +75,7 @@ def force_encoding(raw, verbose): def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, - resolve_entities=False): + resolve_entities=False, assume_utf8=False): ''' Force conversion of byte string to unicode. 
Tries to look for XML/HTML encoding declaration first, if not found uses the chardet library and @@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, encoding = match.group(1) break if encoding is None: - encoding = force_encoding(raw, verbose) + encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8) try: if encoding.lower().strip() == 'macintosh': encoding = 'mac-roman' diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 770ee905e3..5099b820d0 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -149,7 +149,8 @@ class TOC(list): def read_ncx_toc(self, toc): self.base_path = os.path.dirname(toc) - soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0]) + raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0] + soup = NCXSoup(raw) def process_navpoint(np, dest): play_order = np.get('playOrder', None) @@ -160,7 +161,7 @@ class TOC(list): if nl is not None: text = u'' for txt in nl.findAll(re.compile('text')): - text += ''.join([unicode(s) for s in txt.findAll(text=True)]) + text += u''.join([unicode(s) for s in txt.findAll(text=True)]) content = np.find(re.compile('content')) if content is None or not content.has_key('src') or not txt: return From 9d61fbe0d996cb56699bccca84102319917d4505 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 11:52:12 -0700 Subject: [PATCH 05/14] add function to winutil to check for an active internet connection --- setup/extensions.py | 2 +- src/calibre/utils/windows/winutil.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/setup/extensions.py b/setup/extensions.py index 147fbfff5d..5251737101 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -143,7 +143,7 @@ extensions = [ if iswindows: extensions.append(Extension('winutil', ['calibre/utils/windows/winutil.c'], - libraries=['shell32', 'setupapi'], + libraries=['shell32', 'setupapi', 'wininet'], 
cflags=['/X'] )) diff --git a/src/calibre/utils/windows/winutil.c b/src/calibre/utils/windows/winutil.c index efd8f1400d..2f176043b2 100644 --- a/src/calibre/utils/windows/winutil.c +++ b/src/calibre/utils/windows/winutil.c @@ -51,11 +51,15 @@ wherever possible in this module. script being run. So to replace sys.argv, you should use `if len(sys.argv) > 1: sys.argv[1:] = winutil.argv()[1-len(sys.argv):]` +.. function:: internet_connected() -> Return True if there is an active + internet connection. + */ #define UNICODE #include +#include #include #include #include @@ -771,6 +775,15 @@ gettmarg(PyObject *args, struct tm *p) return 1; } +static PyObject * +winutil_internet_connected(PyObject *self, PyObject *args) { + DWORD flags; + BOOL ans = InternetGetConnectedState(&flags, 0); + if (ans) Py_RETURN_TRUE; + Py_RETURN_FALSE; +} + + static PyObject * winutil_strftime(PyObject *self, PyObject *args) { @@ -919,6 +932,10 @@ be a unicode string. Returns unicode strings." "eject_drive(drive_letter)\n\nEject a drive. Raises an exception on failure." 
}, + {"internet_connected", winutil_internet_connected, METH_VARARGS, + "internet_connected()\n\nReturn True if there is an active internet connection" + }, + {NULL, NULL, 0, NULL} }; From 0d0932a4e212f2637e5ebb3296ff6e4fc6807f3a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 13:37:37 -0700 Subject: [PATCH 06/14] Fix #5048 (ARS Technica fails) --- resources/recipes/ars_technica.recipe | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/resources/recipes/ars_technica.recipe b/resources/recipes/ars_technica.recipe index 0bf5a9a3b0..3997ee4645 100644 --- a/resources/recipes/ars_technica.recipe +++ b/resources/recipes/ars_technica.recipe @@ -5,6 +5,7 @@ __copyright__ = '2008-2010, Darko Miletic ' arstechnica.com ''' +import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag @@ -20,7 +21,7 @@ class ArsTechnica2(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' use_embedded_content = False - extra_css = ' body {font-family: sans-serif} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} ' + extra_css = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} ' conversion_options = { 'comments' : description @@ -30,6 +31,10 @@ class ArsTechnica2(BasicNewsRecipe): } + preprocess_regexps = [ + (re.compile(r'
.*?', re.DOTALL|re.IGNORECASE),lambda match: '') + ] keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})] @@ -37,7 +42,7 @@ class ArsTechnica2(BasicNewsRecipe): dict(name=['object','link','embed']) ,dict(name='div', attrs={'class':'read-more-link'}) ] - + remove_attributes=['width','height'] feeds = [ (u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' ) @@ -90,3 +95,5 @@ class ArsTechnica2(BasicNewsRecipe): return soup + def get_article_url(self, article): + return article.get('guid', None).rpartition('?')[0] From 556d8971d2246c9661138907b962f3cc42178ebf Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 18:31:23 -0700 Subject: [PATCH 07/14] Smithsonian Magazine by Krittika Goyal --- resources/recipes/smith.recipe | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 resources/recipes/smith.recipe diff --git a/resources/recipes/smith.recipe b/resources/recipes/smith.recipe new file mode 100644 index 0000000000..e52b2ee709 --- /dev/null +++ b/resources/recipes/smith.recipe @@ -0,0 +1,52 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class SmithsonianMagazine(BasicNewsRecipe): + title = u'Smithsonian Magazine' + language = 'en' + __author__ = 'Krittika Goyal' + oldest_article = 31#days + max_articles_per_feed = 50 + #encoding = 'latin1' + recursions = 1 + match_regexps = ['&page=[2-9]$'] + + remove_stylesheets = True + #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) + remove_tags_after = dict(name='p', attrs={'id':'articlePaginationWrapper'}) + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':'article_sidebar_border'}), + dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}), + #dict(name='ul', attrs={'class':'article-tools'}), + dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}), + ] + + + feeds = [ +('History and 
Archeology', + 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'), +('People and Places', + 'http://feeds.feedburner.com/smithsonianmag/people-places'), +('Science and Nature', + 'http://feeds.feedburner.com/smithsonianmag/science-nature'), +('Arts and Culture', + 'http://feeds.feedburner.com/smithsonianmag/arts-culture'), +('Travel', + 'http://feeds.feedburner.com/smithsonianmag/travel'), +] + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'article-left'}) + #td = heading.findParent(name='td') + #td.extract() + soup = BeautifulSoup('t') + body = soup.find(name='body') + body.insert(0, story) + return soup + + def postprocess_html(self, soup, first): + for p in soup.findAll(id='articlePaginationWrapper'): p.extract() + if not first: + for div in soup.findAll(id='article-head'): div.extract() + return soup From 136d1e4a192704bed8c7669e845729e5f9c05d73 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 20:46:44 -0700 Subject: [PATCH 08/14] Ebook-viewer: Handle non-ascii CSS files when doing font substituitions --- src/calibre/ebooks/oeb/iterator.py | 12 ++++--- src/calibre/utils/network.py | 54 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 src/calibre/utils/network.py diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index cb62774e8d..87ce8683a9 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -152,13 +152,17 @@ class EbookIterator(object): prints('Substituting font family: %s -> %s'%(bad, good)) return match.group().replace(bad, '"%s"'%good) + from calibre.ebooks.chardet import force_encoding for csspath in css_files: with open(csspath, 'r+b') as f: css = f.read() - css = font_family_pat.sub(prepend_embedded_font, css) - f.seek(0) - f.truncate() - f.write(css) + enc = force_encoding(css, False) + css = css.decode(enc, 'replace') + ncss = 
font_family_pat.sub(prepend_embedded_font, css) + if ncss != css: + f.seek(0) + f.truncate() + f.write(ncss.encode(enc)) def __enter__(self, processed=False): self.delete_on_exit = [] diff --git a/src/calibre/utils/network.py b/src/calibre/utils/network.py new file mode 100644 index 0000000000..7e840207cf --- /dev/null +++ b/src/calibre/utils/network.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.constants import iswindows, islinux, isfreebsd + +class LinuxNetworkStatus(object): + + def __init__(self): + try: + import dbus + bus = dbus.SystemBus() + proxy = bus.get_object("org.freedesktop.NetworkManager", + "/org/freedesktop/NetworkManager") + self.manager = dbus.Interface(proxy, "org.freedesktop.DBus.Properties") + except: + self.manager = None + + def __call__(self): + if self.manager is None: + return True + try: + connections = self.manager.Get("org.freedesktop.NetworkManager", + "ActiveConnections") + return len(connections) > 0 + except: + return True + +class WindowsNetworkStatus(object): + + def __init__(self): + from calibre.constants import plugins + self.winutil = plugins['winutil'][0] + + def __call__(self): + if self.winutil is None: + return True + return self.winutil.internet_connected() + +class DummyNetworkStatus(object): + + def __call__(self): + return True + +_network_status = WindowsNetworkStatus() if iswindows else \ + LinuxNetworkStatus() if (islinux or isfreebsd) else \ + DummyNetworkStatus() + +def internet_connected(): + return _network_status() From 3f2e08ba67e6507a8634ae177ae0d980efaf0eb8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 21:03:32 -0700 Subject: [PATCH 09/14] News download scheduler: Don't tru to download news when no active internet connection is present (linux/windows only) --- 
src/calibre/gui2/dialogs/scheduler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/gui2/dialogs/scheduler.py b/src/calibre/gui2/dialogs/scheduler.py index 5aee71d7c6..d11344207f 100644 --- a/src/calibre/gui2/dialogs/scheduler.py +++ b/src/calibre/gui2/dialogs/scheduler.py @@ -18,6 +18,7 @@ from calibre.gui2 import config as gconf, error_dialog from calibre.web.feeds.recipes.model import RecipeModel from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import utcnow +from calibre.utils.network import internet_connected class SchedulerDialog(QDialog, Ui_Dialog): @@ -304,6 +305,8 @@ class Scheduler(QObject): self.download(urn) def download(self, urn): + if not internet_connected(): + return self.lock.lock() doit = urn not in self.download_queue self.lock.unlock() From 9c371377b6ce97ee6b86851d0faf60cf05e09cf6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 21:27:43 -0700 Subject: [PATCH 10/14] calibre-server: Add --pidfile and --daemonize options --- src/calibre/library/server.py | 64 ++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/src/calibre/library/server.py b/src/calibre/library/server.py index 186b9d8578..9d2cba44de 100644 --- a/src/calibre/library/server.py +++ b/src/calibre/library/server.py @@ -20,10 +20,10 @@ try: except ImportError: import Image as PILImage -from calibre.constants import __version__, __appname__ +from calibre.constants import __version__, __appname__, iswindows from calibre.utils.genshi.template import MarkupTemplate from calibre import fit_image, guess_type, prepare_string_for_xml, \ - strftime as _strftime, prints + strftime as _strftime from calibre.library import server_config as config from calibre.library.database2 import LibraryDatabase2, FIELD_MAP from calibre.utils.config import config_dir @@ -423,10 +423,8 @@ class LibraryServer(object): self.opts.port, {'path':'/stanza'}) except: import traceback - print 'Failed to start 
BonJour:' - cherrypy.log('Failed to start BonJour:') - cherrypy.log(traceback.format_exc()) - traceback.print_exc() + cherrypy.log.error('Failed to start BonJour:') + cherrypy.log.error(traceback.format_exc()) cherrypy.engine.block() except Exception, e: self.exception = e @@ -436,10 +434,8 @@ class LibraryServer(object): stop_zeroconf() except: import traceback - print 'Failed to stop BonJour:' - cherrypy.log('Failed to stop BonJour:') - cherrypy.log(traceback.format_exc()) - traceback.print_exc() + cherrypy.log.error('Failed to stop BonJour:') + cherrypy.log.error(traceback.format_exc()) def exit(self): cherrypy.engine.exit() @@ -472,7 +468,8 @@ class LibraryServer(object): return of.getvalue() except Exception, err: import traceback - traceback.print_exc() + cherrypy.log.error('Failed to generate cover:') + cherrypy.log.error(traceback.print_exc()) raise cherrypy.HTTPError(404, 'Failed to generate cover: %s'%err) def get_format(self, id, format): @@ -813,7 +810,7 @@ class LibraryServer(object): # A better search would be great want_mobile = self.MOBILE_UA.search(ua) is not None if self.opts.develop and not want_mobile: - prints('User agent:', ua) + cherrypy.log('User agent: '+ua) if want_opds: return self.stanza(search=kwargs.get('search', None), sortby=kwargs.get('sortby',None), authorid=kwargs.get('authorid',None), @@ -882,12 +879,55 @@ def option_parser(): parser = config().option_parser('%prog '+ _('[options]\n\nStart the calibre content server.')) parser.add_option('--with-library', default=None, help=_('Path to the library folder to serve with the content server')) + parser.add_option('--pidfile', default=None, + help=_('Write process PID to the specified file')) + parser.add_option('--daemonize', default=False, action='store_true', + help='Run process in background as a daemon. 
No effect on windows.') return parser +def daemonize(stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): + try: + pid = os.fork() + if pid > 0: + # exit first parent + sys.exit(0) + except OSError, e: + print >>sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror) + sys.exit(1) + + # decouple from parent environment + os.chdir("/") + os.setsid() + os.umask(0) + + # do second fork + try: + pid = os.fork() + if pid > 0: + # exit from second parent + sys.exit(0) + except OSError, e: + print >>sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror) + sys.exit(1) + + # Redirect standard file descriptors. + si = file(stdin, 'r') + so = file(stdout, 'a+') + se = file(stderr, 'a+', 0) + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + + def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) + if opts.daemonize and not iswindows: + daemonize() + if opts.pidfile is not None: + with open(opts.pidfile, 'wb') as f: + f.write(str(os.getpid())) cherrypy.log.screen = True from calibre.utils.config import prefs if opts.with_library is None: From bf91ca5e9357e954d4a89a8fb644b77671d478ed Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 3 Mar 2010 21:33:57 -0700 Subject: [PATCH 11/14] San Francisco Bay Guardian by Krittika Goyal --- resources/recipes/sfbg.recipe | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 resources/recipes/sfbg.recipe diff --git a/resources/recipes/sfbg.recipe b/resources/recipes/sfbg.recipe new file mode 100644 index 0000000000..5530bc7163 --- /dev/null +++ b/resources/recipes/sfbg.recipe @@ -0,0 +1,42 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class SanFranciscoBayGuardian(BasicNewsRecipe): + title = u'San Francisco Bay Guardian' + language = 'en' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + 
max_articles_per_feed = 25 + #encoding = 'latin1' + + no_stylesheets = True + remove_tags_before = dict(name='div', attrs={'id':'story_header'}) + remove_tags_after = dict(name='div', attrs={'id':'shirttail'}) + remove_tags = [ + dict(name='iframe'), + #dict(name='div', attrs={'class':'related-articles'}), + dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}), + #dict(name='ul', attrs={'class':'article-tools'}), + dict(name='ul', attrs={'id':'story_tabs'}), + ] + + + feeds = [ + ('Cover', 'http://www.newsobserver.com/100/index.rss'), + ('News', 'http://www.newsobserver.com/102/index.rss'), + ('Politics', 'http://www.newsobserver.com/105/index.rss'), + ('Business', 'http://www.newsobserver.com/104/index.rss'), + ('Sports', 'http://www.newsobserver.com/103/index.rss'), + ('College Sports', 'http://www.newsobserver.com/119/index.rss'), + ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'), + ('Editorials', 'http://www.newsobserver.com/158/index.rss')] + + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'story_body'}) + #td = heading.findParent(name='td') + #td.extract() + soup = BeautifulSoup('t') + body = soup.find(name='body') + body.insert(0, story) + return soup From 46736118bbd4524d0bdab501aecdde34cfa37be4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Mar 2010 11:15:41 -0700 Subject: [PATCH 12/14] ... 
--- src/calibre/gui2/ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 2a7be33839..140d652f72 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -30,7 +30,7 @@ from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.config import prefs, dynamic from calibre.utils.ipc.server import Server from calibre.gui2 import warning_dialog, choose_files, error_dialog, \ - question_dialog,\ + question_dialog,\ pixmap_to_data, choose_dir, \ Dispatcher, gprefs, \ available_height, \ From 15c842a0478c5fd21dc36ca66c1e47baa0017d37 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Mar 2010 13:02:25 -0700 Subject: [PATCH 13/14] Ignore non integral play orders when reading NCX TOC files --- src/calibre/ebooks/oeb/reader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 139f60d508..9043db97f1 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -331,7 +331,10 @@ class OEBReader(object): id = child.get('id') klass = child.get('class', 'chapter') - po = int(child.get('playOrder', self.oeb.toc.next_play_order())) + try: + po = int(child.get('playOrder', self.oeb.toc.next_play_order())) + except: + po = self.oeb.toc.next_play_order() authorElement = xpath(child, 'descendant::calibre:meta[@name = "author"]') From 8ba4e70997d8db23437f94c2482b417749f19333 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 4 Mar 2010 17:39:28 -0700 Subject: [PATCH 14/14] Journal of Hospital Medicine by Krittika Goyal --- resources/recipes/johm.recipe | 87 +++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 resources/recipes/johm.recipe diff --git a/resources/recipes/johm.recipe b/resources/recipes/johm.recipe new file mode 100644 index 0000000000..d488d0d3f0 --- /dev/null +++ b/resources/recipes/johm.recipe @@ -0,0 +1,87 @@ +# -*- 
class JournalofHospitalMedicine(BasicNewsRecipe):
    """Recipe for the Journal of Hospital Medicine on Wiley InterScience.

    The site requires a login (``needs_subscription``). The article list
    is scraped from the journal home page instead of being read from
    feeds.
    """

    title       = 'Journal of Hospital Medicine'
    __author__  = 'Krittika Goyal'
    description = 'Medical news'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = True

    no_stylesheets = True
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':'subContent'}),
        dict(name='div', attrs={'id':['contentFrame']}),
    ]

    # Common prefix of every URL this recipe touches.
    WILEY_ROOT = 'http://www3.interscience.wiley.com'

    def get_browser(self):
        """Return a browser that has submitted the site login form.

        Raises ``Exception`` when the response still contains an empty
        ``userName`` marker, which is treated as a failed login.
        """
        br = BasicNewsRecipe.get_browser()
        br.open(self.WILEY_ROOT + '/cgi-bin/home')
        br.select_form(name='siteLogin')
        br['LoginName'] = self.username
        br['Password'] = self.password
        raw = br.submit().read()
        if 'userName = ""' in raw:
            raise Exception('Login failed. Check your username and password')
        return br

    def johm_get_index(self):
        """Fetch the journal home page, which carries the article TOC."""
        return self.index_to_soup(self.WILEY_ROOT + '/journal/111081937/home')

    def parse_index(self):
        """Build the ``[(section, [article, ...]), ...]`` list from the TOC.

        Every ``<h4>`` inside the ``contentCell`` element starts a new
        section; every ``<strong>`` after a section heading is taken as
        an article title whose link is the nearby anchor with
        ``/HTMLSTART`` in its href. Raises ``ValueError`` when the
        ``contentCell`` element is missing from the page.
        """
        toc_soup = self.johm_get_index()

        div = toc_soup.find(id='contentCell')
        if div is None:
            # Fail with a readable message instead of an AttributeError
            # on None further down.
            raise ValueError('Could not find the table of contents '
                    '(id="contentCell") on the journal home page')

        current_section = None
        current_articles = []
        feeds = []
        for tag in div.findAll(True):
            if tag.name == 'h4':
                # Section heading found: store the previous section first
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(tag)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif current_section is not None and tag.name == 'strong':
                title = self.tag_to_string(tag)
                link = tag.parent.parent.find('a',
                        href=lambda href: href and '/HTMLSTART' in href)
                if link is None:
                    continue
                url = link.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = self.WILEY_ROOT + url
                # Request the full-text page instead of the start page
                url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url':url,
                    'description':'', 'date':''})

        # Store the section that was still open when the loop ended
        if current_articles and current_section:
            feeds.append((current_section, current_articles))

        return feeds

    def preprocess_html(self, soup):
        """Rewrite every image source, replacing ``tfig`` with ``nfig``."""
        for img in soup.findAll('img', src=True):
            img['src'] = img['src'].replace('tfig', 'nfig')
        return soup