From c024252eb88d16006401ec31cf27ebf36083b0bb Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 2 Mar 2010 21:36:51 -0500
Subject: [PATCH 01/14] Fix bug in #4971: invalid mode.

---
 src/calibre/ebooks/pdb/pdf/reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 7ad5776325..3ae9f8ccca 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -27,7 +27,7 @@ class Reader(FormatReader):
         self.log.info('Extracting PDF...')
 
         with TemporaryFile() as pdf_n:
-            pdf = open(pdf_n, 'rw+b')
+            pdf = open(pdf_n, 'w+b')  # read/write binary; 'rwb' is not a valid mode
             for x in xrange(self.header.section_count()):
                 pdf.write(self.header.section_data(x))
 

From 68f0f892e4f04dbdf4f8252773babfb9c369b594 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 01:23:22 -0700
Subject: [PATCH 02/14] EPUB to EPUB conversions: Preserve font encryption

---
 src/calibre/ebooks/epub/input.py    | 36 +++++++++------
 src/calibre/ebooks/epub/output.py   | 68 ++++++++++++++++++++++++++++-
 src/calibre/ebooks/metadata/opf2.py |  3 ++
 3 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py
index cf903c0a5d..48699521c7 100644
--- a/src/calibre/ebooks/epub/input.py
+++ b/src/calibre/ebooks/epub/input.py
@@ -3,7 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, re, uuid
+import os, uuid
 from itertools import cycle
 
 from lxml import etree
@@ -19,8 +19,7 @@ class EPUBInput(InputFormatPlugin):
 
     recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)])
 
-    @classmethod
-    def decrypt_font(cls, key, path):
+    def decrypt_font(self, key, path):
         raw = open(path, 'rb').read()
         crypt = raw[:1024]
         key = cycle(iter(key))
@@ -29,13 +28,18 @@ class EPUBInput(InputFormatPlugin):
             f.write(decrypt)
             f.write(raw[1024:])
 
-    @classmethod
-    def process_encryption(cls, encfile, opf, log):
+    def process_encryption(self, encfile, opf, log):
         key = None
-        m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read())
-        if m:
-            key = m.group(1)
-            key = list(map(ord, uuid.UUID(key).bytes))
+        for item in opf.identifier_iter():
+            scheme = None
+            for attr in item.attrib.keys():  # 'attr', not 'key': must not clobber the font key
+                if attr.endswith('scheme'):
+                    scheme = item.get(attr)
+            if (scheme and scheme.lower() == 'uuid') or \
+                    (item.text and item.text.startswith('urn:uuid:')):
+                key = str(item.text).rpartition(':')[-1]
+                key = list(map(ord, uuid.UUID(key).bytes))
+
         try:
             root = etree.parse(encfile)
             for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
@@ -46,7 +50,8 @@ class EPUBInput(InputFormatPlugin):
                 uri = cr.get('URI')
                 path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                 if os.path.exists(path):
-                    cls.decrypt_font(key, path)
+                    self._encrypted_font_uris.append(uri)
+                    self.decrypt_font(key, path)
             return True
         except:
             import traceback
@@ -115,14 +120,17 @@ class EPUBInput(InputFormatPlugin):
         if opf is None:
             raise ValueError('%s is not a valid EPUB file'%path)
 
-        if os.path.exists(encfile):
-            if not self.process_encryption(encfile, opf, log):
-                raise DRMError(os.path.basename(path))
-
         opf = os.path.relpath(opf, os.getcwdu())
         parts = os.path.split(opf)
         opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
 
+        self._encrypted_font_uris = []
+        if os.path.exists(encfile):
+            if not self.process_encryption(encfile, opf, log):
+                raise DRMError(os.path.basename(path))
+        self.encrypted_fonts = self._encrypted_font_uris
+
+
         if len(parts) > 1 and parts[0]:
             delta = '/'.join(parts[:-1])+'/'
             for elem in opf.itermanifest():
diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py
index 6e74a748b1..2b27f09664 100644
--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@@ -12,8 +12,9 @@ from urllib import unquote
 from calibre.customize.conversion import OutputFormatPlugin
 from calibre.ptempfile import TemporaryDirectory
 from calibre.constants import __appname__, __version__
-from calibre import strftime, guess_type, prepare_string_for_xml
+from calibre import strftime, guess_type, prepare_string_for_xml, CurrentDir
 from calibre.customize.conversion import OptionRecommendation
+from calibre.constants import filesystem_encoding
 
 from lxml import etree
 
@@ -170,6 +171,19 @@ class EPUBOutput(OutputFormatPlugin):
 
         self.workaround_sony_quirks()
 
+        from calibre.ebooks.oeb.base import OPF
+        identifiers = oeb.metadata['identifier']
+        uuid = None
+        for x in identifiers:
+            if (x.get(OPF('scheme'), None) or '').lower() == 'uuid' or unicode(x).startswith('urn:uuid:'):  # scheme may be absent
+                uuid = unicode(x).split(':')[-1]
+                break
+        if uuid is None:
+            self.log.warn('No UUID identifier found')
+            from uuid import uuid4
+            uuid = str(uuid4())
+            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
+
         with TemporaryDirectory('_epub_output') as tdir:
             from calibre.customize.ui import plugin_for_output_format
             oeb_output = plugin_for_output_format('oeb')
@@ -177,10 +191,16 @@ class EPUBOutput(OutputFormatPlugin):
             opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
             self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                     if x.endswith('.ncx')][0])
+            encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
+            encryption = None
+            if encrypted_fonts:
+                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
 
             from calibre.ebooks.epub import initialize_container
             epub = initialize_container(output_path, os.path.basename(opf))
             epub.add_dir(tdir)
+            if encryption is not None:
+                epub.writestr('META-INF/encryption.xml', encryption)
             if opts.extract_to is not None:
                 if os.path.exists(opts.extract_to):
                     shutil.rmtree(opts.extract_to)
@@ -189,6 +209,52 @@ class EPUBOutput(OutputFormatPlugin):
                 self.log.info('EPUB extracted to', opts.extract_to)
             epub.close()
 
+    def encrypt_fonts(self, uris, tdir, uuid):
+        from binascii import unhexlify
+
+        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
+        if len(key) < 16:
+            raise ValueError('UUID identifier %r is invalid'%uuid)
+        key = unhexlify((key + key)[:32])
+        key = tuple(map(ord, key))
+        paths = []
+        with CurrentDir(tdir):
+            paths = [os.path.join(*x.split('/')) for x in uris]
+            uris = dict(zip(uris, paths))
+            fonts = []
+            for uri in list(uris.keys()):
+                path = uris[uri]
+                if isinstance(path, unicode):
+                    path = path.encode(filesystem_encoding)
+                if not os.path.exists(path):
+                    uris.pop(uri)
+                    continue
+                self.log.debug('Encrypting font:', uri)
+                with open(path, 'r+b') as f:
+                    data = f.read(1024)
+                    f.seek(0)
+                    for i in range(len(data)):  # file may be shorter than 1024 bytes
+                        f.write(chr(ord(data[i]) ^ key[i%16]))
+                if not isinstance(uri, unicode):
+                    uri = uri.decode('utf-8')
+                fonts.append(u'''
+                <enc:EncryptedData>
+                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
+                    <enc:CipherData>
+                    <enc:CipherReference URI="%s"/>
+                    </enc:CipherData>
+                </enc:EncryptedData>
+                '''%(uri.replace('&', '&amp;').replace('<', '&lt;').replace('"', '&quot;')))
+            if fonts:
+                    ans = '''<encryption
+                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
+                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
+                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
+                    '''
+                    ans += (u'\n'.join(fonts)).encode('utf-8')
+                    ans += '\n</encryption>'
+                    return ans
+
     def default_cover(self):
         '''
         Create a generic cover for books that dont have a cover
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 5e57b0b515..5cbaf604c4 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -779,6 +779,9 @@ class OPF(object):
             self.set_text(matches[0], unicode(val))
         return property(fget=fget, fset=fset)
 
+    def identifier_iter(self):
+        for item in self.identifier_path(self.metadata):
+            yield item
 
     def guess_cover(self):
         '''

From 13a9733d42f8dcfc5e585276637ec60888bc63b5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 01:29:14 -0700
Subject: [PATCH 03/14] Astronomy Pic of the Day by Starson17. Fixes #5045 (New
 Recipe: Astronomy Picture of the Day)

---
 resources/recipes/apod.recipe       |  37 +++++++++
 resources/recipes/epicurious.recipe | 116 ++++++++++++++--------------
 2 files changed, 95 insertions(+), 58 deletions(-)
 create mode 100644 resources/recipes/apod.recipe

diff --git a/resources/recipes/apod.recipe b/resources/recipes/apod.recipe
new file mode 100644
index 0000000000..01f4ebf391
--- /dev/null
+++ b/resources/recipes/apod.recipe
@@ -0,0 +1,37 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class APOD(BasicNewsRecipe):
+    title          = u'Astronomy Picture of the Day'
+    __author__  = 'Starson17'
+    description = 'Astronomy Pictures'
+    language = 'en'
+    use_embedded_content    = False
+    no_stylesheets        = True
+    cover_url     = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg'
+    remove_javascript = True
+    recursions = 0
+    oldest_article        = 14
+
+    feeds = [
+             (u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss')
+             ]
+
+    extra_css = '''
+                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+		'''
+    def postprocess_html(self, soup, first_fetch):
+        # Remove the last <center> element, the first <p> and the last two <p>
+        center_tags = soup.findAll(['center'])
+        p_tags = soup.findAll(['p'])
+        # Slices (not [0]) so a page without these tags does not raise
+        for tag in center_tags[-1:]:
+            tag.extract()
+        for tag in p_tags[:1]:
+            tag.extract()
+        for tag in p_tags[-2:]:
+            tag.extract()
+        return soup
+
diff --git a/resources/recipes/epicurious.recipe b/resources/recipes/epicurious.recipe
index 7d0925a4bb..dc86af73fd 100644
--- a/resources/recipes/epicurious.recipe
+++ b/resources/recipes/epicurious.recipe
@@ -1,58 +1,58 @@
-#!/usr/bin/env  python
-
-__license__   = 'GPL v3'
-__copyright__ = '2010, Starson17'
-'''
-www.epicurious.com
-'''
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Epicurious(BasicNewsRecipe):
-    title          = u'Epicurious'
-    __author__  = 'Starson17'
-    description = 'Food and Recipes from Epicurious'
-    cover_url     = 'http://up6.podbean.com/image-logos/21849_logo.jpg'
-    publisher      = 'Epicurious'
-    tags           = 'news, food, gourmet, recipes'          
-    language = 'en'
-    use_embedded_content    = False
-    no_stylesheets        = True
-    remove_javascript = True
-    recursions = 3
-    oldest_article        = 14
-    max_articles_per_feed = 20
-
-    keep_only_tags = [dict(name='div', attrs={'class':['mainconsolewrapper','videoheader','content_unit','entry-content','see_more_block']}),
-                      dict(name='div', attrs={'id':['headline','introBlock','ingredients','preparation','articleContent','in_categories_block']})
-                           ]
-
-    remove_tags = [{'id':['printShoppingList','addnoteLnk','btnUploadVideo','enlarge_image']},
-                   {'class':['subLnk','sbmWrapper','detail_division','entry-footer','comment-footer']},
-                   dict(name='div', attrs={'class':['tagged','comments']})
-                   ]
-
-    remove_tags_after = [dict(name='div', attrs={'class':'entry-content'})]
-
-    feeds = [
-             (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'),
-             (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'),
-             (u'Features ', u'http://feeds.epicurious.com/latestfeatures'),
-             (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog')   
-             ]
-    
-    match_regexps = [
-                     r'http://www.epicurious.com/.*recipes/.*/views'
-                     ]
-
-    preprocess_regexps = [
-        (re.compile(r'/\n', re.DOTALL|re.IGNORECASE), lambda match: '/'),
-        (re.compile(r'_116.jpg', re.DOTALL|re.IGNORECASE), lambda match: '.jpg'),
-        (re.compile('<div class=\"comments\".*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')
-        ]
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td']):
-            t.name = 'div'
-        return soup
-        
\ No newline at end of file
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Starson17'
+'''
+www.epicurious.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Epicurious(BasicNewsRecipe):
+    title          = u'Epicurious'
+    __author__  = 'Starson17'
+    description = 'Food and Recipes from Epicurious'
+    cover_url     = 'http://up6.podbean.com/image-logos/21849_logo.jpg'
+    publisher      = 'Epicurious'
+    tags           = 'news, food, gourmet, recipes'
+    language = 'en'
+    use_embedded_content    = False
+    no_stylesheets        = True
+    remove_javascript = True
+    recursions = 3
+    oldest_article        = 14
+    max_articles_per_feed = 20
+
+    keep_only_tags = [dict(name='div', attrs={'class':['mainconsolewrapper','videoheader','content_unit','entry-content','see_more_block']}),
+                      dict(name='div', attrs={'id':['headline','introBlock','ingredients','preparation','articleContent','in_categories_block']})
+                           ]
+
+    remove_tags = [{'id':['printShoppingList','addnoteLnk','btnUploadVideo','enlarge_image']},
+                   {'class':['subLnk','sbmWrapper','detail_division','entry-footer','comment-footer']},
+                   dict(name='div', attrs={'class':['tagged','comments']})
+                   ]
+
+    remove_tags_after = [dict(name='div', attrs={'class':'entry-content'})]
+
+    feeds = [
+             (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'),
+             (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'),
+             (u'Features ', u'http://feeds.epicurious.com/latestfeatures'),
+             (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog')
+             ]
+
+    match_regexps = [
+                     r'http://www.epicurious.com/.*recipes/.*/views'
+                     ]
+
+    preprocess_regexps = [
+        (re.compile(r'/\n', re.DOTALL|re.IGNORECASE), lambda match: '/'),
+        (re.compile(r'_116.jpg', re.DOTALL|re.IGNORECASE), lambda match: '.jpg'),
+        (re.compile('<div class=\"comments\".*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')
+        ]
+
+    def postprocess_html(self, soup, first_fetch):
+        for t in soup.findAll(['table', 'tr', 'td']):
+            t.name = 'div'
+        return soup
+

From 833c54c5d2f4bb9a18a3e9bca3dff6a9cc5361b7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 01:43:38 -0700
Subject: [PATCH 04/14] When decoding NCX toc files, if no encoding is declared
 and detection has less than 100% confidence, assume UTF-8. Fixes #5039
 (Strange behaviour of TOC for one character)

---
 src/calibre/ebooks/chardet/__init__.py | 8 +++++---
 src/calibre/ebooks/metadata/toc.py     | 5 +++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py
index 975ffc1331..25341b120a 100644
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
                         "x-sjis" : "shift-jis" }
 
 
-def force_encoding(raw, verbose):
+def force_encoding(raw, verbose, assume_utf8=False):
     from calibre.constants import preferred_encoding
     try:
         chardet = detect(raw)
     except:
         chardet = {'encoding':preferred_encoding, 'confidence':0}
     encoding = chardet['encoding']
+    if chardet['confidence'] < 1 and assume_utf8:
+        encoding = 'utf-8'
     if chardet['confidence'] < 1 and verbose:
         print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
     if not encoding:
@@ -73,7 +75,7 @@ def force_encoding(raw, verbose):
 
 
 def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
-                   resolve_entities=False):
+                   resolve_entities=False, assume_utf8=False):
     '''
     Force conversion of byte string to unicode. Tries to look for XML/HTML
     encoding declaration first, if not found uses the chardet library and
@@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                 encoding = match.group(1)
                 break
         if encoding is None:
-            encoding = force_encoding(raw, verbose)
+            encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
         try:
             if encoding.lower().strip() == 'macintosh':
                 encoding = 'mac-roman'
diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py
index 770ee905e3..5099b820d0 100644
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -149,7 +149,8 @@ class TOC(list):
 
     def read_ncx_toc(self, toc):
         self.base_path = os.path.dirname(toc)
-        soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
+        raw  = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
+        soup = NCXSoup(raw)
 
         def process_navpoint(np, dest):
             play_order = np.get('playOrder', None)
@@ -160,7 +161,7 @@ class TOC(list):
             if nl is not None:
                 text = u''
                 for txt in nl.findAll(re.compile('text')):
-                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
+                    text += u''.join([unicode(s) for s in txt.findAll(text=True)])
                 content = np.find(re.compile('content'))
                 if content is None or not content.has_key('src') or not txt:
                     return

From 9d61fbe0d996cb56699bccca84102319917d4505 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 11:52:12 -0700
Subject: [PATCH 05/14] add function to winutil to check for an active internet
 connection

---
 setup/extensions.py                 |  2 +-
 src/calibre/utils/windows/winutil.c | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/setup/extensions.py b/setup/extensions.py
index 147fbfff5d..5251737101 100644
--- a/setup/extensions.py
+++ b/setup/extensions.py
@@ -143,7 +143,7 @@ extensions = [
 if iswindows:
     extensions.append(Extension('winutil',
                 ['calibre/utils/windows/winutil.c'],
-                libraries=['shell32', 'setupapi'],
+                libraries=['shell32', 'setupapi', 'wininet'],
                 cflags=['/X']
                 ))
 
diff --git a/src/calibre/utils/windows/winutil.c b/src/calibre/utils/windows/winutil.c
index efd8f1400d..2f176043b2 100644
--- a/src/calibre/utils/windows/winutil.c
+++ b/src/calibre/utils/windows/winutil.c
@@ -51,11 +51,15 @@ wherever possible in this module.
     script being run. So to replace sys.argv, you should use
     `if len(sys.argv) > 1: sys.argv[1:] = winutil.argv()[1-len(sys.argv):]`
 
+.. function:: internet_connected() -> Return True if there is an active
+   internet connection.
+
 */
 
 
 #define UNICODE
 #include <Windows.h>
+#include <Wininet.h>
 #include <Python.h>
 #include <structseq.h>
 #include <timefuncs.h>
@@ -771,6 +775,15 @@ gettmarg(PyObject *args, struct tm *p)
 	return 1;
 }
 
+static PyObject *
+winutil_internet_connected(PyObject *self, PyObject *args) {
+    DWORD flags;
+    BOOL ans = InternetGetConnectedState(&flags, 0);
+    if (ans) Py_RETURN_TRUE;
+    Py_RETURN_FALSE;
+}
+
+
 static PyObject *
 winutil_strftime(PyObject *self, PyObject *args)
 {
@@ -919,6 +932,10 @@ be a unicode string. Returns unicode strings."
 			"eject_drive(drive_letter)\n\nEject a drive. Raises an exception on failure."
 	},
 
+    {"internet_connected", winutil_internet_connected, METH_VARARGS,
+        "internet_connected()\n\nReturn True if there is an active internet connection"
+    },
+
     {NULL, NULL, 0, NULL}
 };
 

From 0d0932a4e212f2637e5ebb3296ff6e4fc6807f3a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 13:37:37 -0700
Subject: [PATCH 06/14] Fix #5048 (ARS Technica fails)

---
 resources/recipes/ars_technica.recipe | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/resources/recipes/ars_technica.recipe b/resources/recipes/ars_technica.recipe
index 0bf5a9a3b0..3997ee4645 100644
--- a/resources/recipes/ars_technica.recipe
+++ b/resources/recipes/ars_technica.recipe
@@ -5,6 +5,7 @@ __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 arstechnica.com
 '''
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 
@@ -20,7 +21,7 @@ class ArsTechnica2(BasicNewsRecipe):
     no_stylesheets        = True
     encoding              = 'utf-8'
     use_embedded_content  = False
-    extra_css             = ' body {font-family: sans-serif} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '
+    extra_css             = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '
 
     conversion_options = {
                              'comments'  : description
@@ -30,6 +31,10 @@ class ArsTechnica2(BasicNewsRecipe):
                          }
 
 
+    preprocess_regexps = [
+                (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
+               ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
+                         ]
 
     keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]
 
@@ -37,7 +42,7 @@ class ArsTechnica2(BasicNewsRecipe):
                      dict(name=['object','link','embed'])
                     ,dict(name='div', attrs={'class':'read-more-link'})
                   ]
-
+    remove_attributes=['width','height']
 
     feeds = [
               (u'Infinite Loop (Apple content)'        , u'http://feeds.arstechnica.com/arstechnica/apple/'      )
@@ -90,3 +95,5 @@ class ArsTechnica2(BasicNewsRecipe):
 
         return soup
 
+    def get_article_url(self, article):
+        return (article.get('guid',  None) or '').partition('?')[0] or None  # drop query string; None if no guid

From 556d8971d2246c9661138907b962f3cc42178ebf Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 18:31:23 -0700
Subject: [PATCH 07/14] Smithsonian Magazine by Krittika Goyal

---
 resources/recipes/smith.recipe | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 resources/recipes/smith.recipe

diff --git a/resources/recipes/smith.recipe b/resources/recipes/smith.recipe
new file mode 100644
index 0000000000..e52b2ee709
--- /dev/null
+++ b/resources/recipes/smith.recipe
@@ -0,0 +1,52 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class SmithsonianMagazine(BasicNewsRecipe):
+    title          = u'Smithsonian Magazine'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 31#days
+    max_articles_per_feed = 50
+    #encoding = 'latin1'
+    recursions = 1
+    match_regexps = ['&page=[2-9]$']
+
+    remove_stylesheets = True
+    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    remove_tags_after  = dict(name='p', attrs={'id':'articlePaginationWrapper'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':'article_sidebar_border'}),
+       dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
+       #dict(name='ul', attrs={'class':'article-tools'}),
+       dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
+    ]
+
+
+    feeds          = [
+('History and Archeology',
+ 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
+('People and Places',
+ 'http://feeds.feedburner.com/smithsonianmag/people-places'),
+('Science and Nature',
+ 'http://feeds.feedburner.com/smithsonianmag/science-nature'),
+('Arts and Culture',
+ 'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
+('Travel',
+ 'http://feeds.feedburner.com/smithsonianmag/travel'),
+]
+
+    def preprocess_html(self, soup):
+        story = soup.find(name='div', attrs={'id':'article-left'})
+        if story is None:
+            return soup  # no 'article-left' div found; leave the page untouched
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
+        if not first:
+             for div in soup.findAll(id='article-head'): div.extract()
+        return soup

From 136d1e4a192704bed8c7669e845729e5f9c05d73 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 20:46:44 -0700
Subject: [PATCH 08/14] Ebook-viewer: Handle non-ascii CSS files when doing
 font substitutions

---
 src/calibre/ebooks/oeb/iterator.py | 12 ++++---
 src/calibre/utils/network.py       | 54 ++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 4 deletions(-)
 create mode 100644 src/calibre/utils/network.py

diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py
index cb62774e8d..87ce8683a9 100644
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@@ -152,13 +152,17 @@ class EbookIterator(object):
                         prints('Substituting font family: %s -> %s'%(bad, good))
                         return match.group().replace(bad, '"%s"'%good)
 
+            from calibre.ebooks.chardet import force_encoding
             for csspath in css_files:
                 with open(csspath, 'r+b') as f:
                     css = f.read()
-                    css = font_family_pat.sub(prepend_embedded_font, css)
-                    f.seek(0)
-                    f.truncate()
-                    f.write(css)
+                    enc = force_encoding(css, False)
+                    css = css.decode(enc, 'replace')
+                    ncss = font_family_pat.sub(prepend_embedded_font, css)
+                    if ncss != css:
+                        f.seek(0)
+                        f.truncate()
+                        f.write(ncss.encode(enc))
 
     def __enter__(self, processed=False):
         self.delete_on_exit = []
diff --git a/src/calibre/utils/network.py b/src/calibre/utils/network.py
new file mode 100644
index 0000000000..7e840207cf
--- /dev/null
+++ b/src/calibre/utils/network.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.constants import iswindows, islinux, isfreebsd
+
+class LinuxNetworkStatus(object):
+
+    def __init__(self):
+        try:
+            import dbus
+            bus = dbus.SystemBus()
+            proxy = bus.get_object("org.freedesktop.NetworkManager",
+                        "/org/freedesktop/NetworkManager")
+            self.manager = dbus.Interface(proxy, "org.freedesktop.DBus.Properties")
+        except:
+            self.manager = None
+
+    def __call__(self):
+        if self.manager is None:
+            return True
+        try:
+            connections = self.manager.Get("org.freedesktop.NetworkManager",
+                        "ActiveConnections")
+            return len(connections) > 0
+        except:
+            return True
+
+class WindowsNetworkStatus(object):
+
+    def __init__(self):
+        from calibre.constants import plugins
+        self.winutil = plugins['winutil'][0]
+
+    def __call__(self):
+        if self.winutil is None:
+            return True
+        return self.winutil.internet_connected()
+
+class DummyNetworkStatus(object):
+
+    def __call__(self):
+        return True
+
+_network_status = WindowsNetworkStatus() if iswindows else \
+        LinuxNetworkStatus() if (islinux or isfreebsd) else \
+        DummyNetworkStatus()
+
+def internet_connected():
+    return _network_status()

From 3f2e08ba67e6507a8634ae177ae0d980efaf0eb8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 21:03:32 -0700
Subject: [PATCH 09/14] News download scheduler: Don't try to download news
 when no active internet connection is present (linux/windows only)

---
 src/calibre/gui2/dialogs/scheduler.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/gui2/dialogs/scheduler.py b/src/calibre/gui2/dialogs/scheduler.py
index 5aee71d7c6..d11344207f 100644
--- a/src/calibre/gui2/dialogs/scheduler.py
+++ b/src/calibre/gui2/dialogs/scheduler.py
@@ -18,6 +18,7 @@ from calibre.gui2 import config as gconf, error_dialog
 from calibre.web.feeds.recipes.model import RecipeModel
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.date import utcnow
+from calibre.utils.network import internet_connected
 
 class SchedulerDialog(QDialog, Ui_Dialog):
 
@@ -304,6 +305,8 @@ class Scheduler(QObject):
             self.download(urn)
 
     def download(self, urn):
+        if not internet_connected():
+            return
         self.lock.lock()
         doit = urn not in self.download_queue
         self.lock.unlock()

From 9c371377b6ce97ee6b86851d0faf60cf05e09cf6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 21:27:43 -0700
Subject: [PATCH 10/14] calibre-server: Add --pidfile and --daemonize options

---
 src/calibre/library/server.py | 64 ++++++++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 12 deletions(-)

diff --git a/src/calibre/library/server.py b/src/calibre/library/server.py
index 186b9d8578..9d2cba44de 100644
--- a/src/calibre/library/server.py
+++ b/src/calibre/library/server.py
@@ -20,10 +20,10 @@ try:
 except ImportError:
     import Image as PILImage
 
-from calibre.constants import __version__, __appname__
+from calibre.constants import __version__, __appname__, iswindows
 from calibre.utils.genshi.template import MarkupTemplate
 from calibre import fit_image, guess_type, prepare_string_for_xml, \
-        strftime as _strftime, prints
+        strftime as _strftime
 from calibre.library import server_config as config
 from calibre.library.database2 import LibraryDatabase2, FIELD_MAP
 from calibre.utils.config import config_dir
@@ -423,10 +423,8 @@ class LibraryServer(object):
                              self.opts.port, {'path':'/stanza'})
             except:
                 import traceback
-                print 'Failed to start BonJour:'
-                cherrypy.log('Failed to start BonJour:')
-                cherrypy.log(traceback.format_exc())
-                traceback.print_exc()
+                cherrypy.log.error('Failed to start BonJour:')
+                cherrypy.log.error(traceback.format_exc())
             cherrypy.engine.block()
         except Exception, e:
             self.exception = e
@@ -436,10 +434,8 @@ class LibraryServer(object):
                 stop_zeroconf()
             except:
                 import traceback
-                print 'Failed to stop BonJour:'
-                cherrypy.log('Failed to stop BonJour:')
-                cherrypy.log(traceback.format_exc())
-                traceback.print_exc()
+                cherrypy.log.error('Failed to stop BonJour:')
+                cherrypy.log.error(traceback.format_exc())
 
     def exit(self):
         cherrypy.engine.exit()
@@ -472,7 +468,8 @@ class LibraryServer(object):
             return of.getvalue()
         except Exception, err:
             import traceback
-            traceback.print_exc()
+            cherrypy.log.error('Failed to generate cover:')
+            cherrypy.log.error(traceback.format_exc())
             raise cherrypy.HTTPError(404, 'Failed to generate cover: %s'%err)
 
     def get_format(self, id, format):
@@ -813,7 +810,7 @@ class LibraryServer(object):
         # A better search would be great
         want_mobile = self.MOBILE_UA.search(ua) is not None
         if self.opts.develop and not want_mobile:
-            prints('User agent:', ua)
+            cherrypy.log('User agent: '+ua)
 
         if want_opds:
             return self.stanza(search=kwargs.get('search', None), sortby=kwargs.get('sortby',None), authorid=kwargs.get('authorid',None),
@@ -882,12 +879,55 @@ def option_parser():
     parser = config().option_parser('%prog '+ _('[options]\n\nStart the calibre content server.'))
     parser.add_option('--with-library', default=None,
             help=_('Path to the library folder to serve with the content server'))
+    parser.add_option('--pidfile', default=None,
+            help=_('Write process PID to the specified file'))
+    parser.add_option('--daemonize', default=False, action='store_true',
+            help='Run process in background as a daemon. No effect on windows.')
     return parser
 
+def daemonize(stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
+    try:
+        pid = os.fork()
+        if pid > 0:
+            # exit first parent
+            sys.exit(0)
+    except OSError, e:
+        print >>sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror)
+        sys.exit(1)
+
+    # decouple from parent environment
+    os.chdir("/")
+    os.setsid()
+    os.umask(0)
+
+    # do second fork
+    try:
+        pid = os.fork()
+        if pid > 0:
+            # exit from second parent
+            sys.exit(0)
+    except OSError, e:
+        print >>sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror)
+        sys.exit(1)
+
+    # Redirect standard file descriptors.
+    si = file(stdin, 'r')
+    so = file(stdout, 'a+')
+    se = file(stderr, 'a+', 0)
+    os.dup2(si.fileno(), sys.stdin.fileno())
+    os.dup2(so.fileno(), sys.stdout.fileno())
+    os.dup2(se.fileno(), sys.stderr.fileno())
+
+
 
 def main(args=sys.argv):
     parser = option_parser()
     opts, args = parser.parse_args(args)
+    if opts.daemonize and not iswindows:
+        daemonize()
+    if opts.pidfile is not None:
+        with open(opts.pidfile, 'wb') as f:
+            f.write(str(os.getpid()))
     cherrypy.log.screen = True
     from calibre.utils.config import prefs
     if opts.with_library is None:

From bf91ca5e9357e954d4a89a8fb644b77671d478ed Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 3 Mar 2010 21:33:57 -0700
Subject: [PATCH 11/14] San Francisco Bay Guardian by Krittika Goyal

---
 resources/recipes/sfbg.recipe | 42 +++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 resources/recipes/sfbg.recipe

diff --git a/resources/recipes/sfbg.recipe b/resources/recipes/sfbg.recipe
new file mode 100644
index 0000000000..5530bc7163
--- /dev/null
+++ b/resources/recipes/sfbg.recipe
@@ -0,0 +1,42 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class SanFranciscoBayGuardian(BasicNewsRecipe):
+    title          = u'San Francisco Bay Guardian'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    #encoding = 'latin1'
+
+    no_stylesheets = True
+    remove_tags_before = dict(name='div', attrs={'id':'story_header'})
+    remove_tags_after  = dict(name='div', attrs={'id':'shirttail'})
+    remove_tags = [
+       dict(name='iframe'),
+       #dict(name='div', attrs={'class':'related-articles'}),
+        dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
+       #dict(name='ul', attrs={'class':'article-tools'}),
+       dict(name='ul', attrs={'id':'story_tabs'}),
+    ]
+
+
+    feeds = [
+        ('Cover', 'http://www.newsobserver.com/100/index.rss'),
+        ('News', 'http://www.newsobserver.com/102/index.rss'),
+        ('Politics', 'http://www.newsobserver.com/105/index.rss'),
+        ('Business', 'http://www.newsobserver.com/104/index.rss'),
+        ('Sports', 'http://www.newsobserver.com/103/index.rss'),
+        ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
+        ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
+        ('Editorials', 'http://www.newsobserver.com/158/index.rss')]
+
+
+    def preprocess_html(self, soup):
+        story = soup.find(name='div', attrs={'id':'story_body'})
+        #td = heading.findParent(name='td')
+        #td.extract()
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        return soup

From 46736118bbd4524d0bdab501aecdde34cfa37be4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 4 Mar 2010 11:15:41 -0700
Subject: [PATCH 12/14] ...

---
 src/calibre/gui2/ui.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py
index 2a7be33839..140d652f72 100644
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@@ -30,7 +30,7 @@ from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.config import prefs, dynamic
 from calibre.utils.ipc.server import Server
 from calibre.gui2 import warning_dialog, choose_files, error_dialog, \
-                            question_dialog,\
+                           question_dialog,\
                            pixmap_to_data, choose_dir, \
                            Dispatcher, gprefs, \
                            available_height, \

From 15c842a0478c5fd21dc36ca66c1e47baa0017d37 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 4 Mar 2010 13:02:25 -0700
Subject: [PATCH 13/14] Ignore non integral play orders when reading NCX TOC
 files

---
 src/calibre/ebooks/oeb/reader.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index 139f60d508..9043db97f1 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -331,7 +331,10 @@ class OEBReader(object):
             id = child.get('id')
             klass = child.get('class', 'chapter')
 
-            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
+            try:
+                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
+            except:
+                po = self.oeb.toc.next_play_order()
 
             authorElement = xpath(child,
                     'descendant::calibre:meta[@name = "author"]')

From 8ba4e70997d8db23437f94c2482b417749f19333 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 4 Mar 2010 17:39:28 -0700
Subject: [PATCH 14/14] Journal of Hospital Medicine by Krittika Goyal

---
 resources/recipes/johm.recipe | 87 +++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 resources/recipes/johm.recipe

diff --git a/resources/recipes/johm.recipe b/resources/recipes/johm.recipe
new file mode 100644
index 0000000000..d488d0d3f0
--- /dev/null
+++ b/resources/recipes/johm.recipe
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class JournalofHospitalMedicine(BasicNewsRecipe):
+
+    title       = 'Journal of Hospital Medicine'
+    __author__  = 'Krittika Goyal'
+    description = 'Medical news'
+    timefmt = ' [%d %b, %Y]'
+    needs_subscription = True
+
+    no_stylesheets = True
+    #remove_tags_before = dict(name='div', attrs={'align':'center'})
+    #remove_tags_after  = dict(name='ol', attrs={'compact':'COMPACT'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':'subContent'}),
+       dict(name='div', attrs={'id':['contentFrame']}),
+       #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}),
+       #dict(name='table', attrs={'align':'RIGHT'}),
+    ]
+
+
+
+   # TO LOGIN
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        br.open('http://www3.interscience.wiley.com/cgi-bin/home')
+        br.select_form(name='siteLogin')
+        br['LoginName'] = self.username
+        br['Password'] = self.password
+        response = br.submit()
+        raw = response.read()
+        if 'userName = ""' in raw:
+            raise Exception('Login failed. Check your username and password')
+        return br
+
+    #TO GET ARTICLE TOC
+    def johm_get_index(self):
+            return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home')
+
+    # To parse article toc
+    def parse_index(self):
+            parse_soup = self.johm_get_index()
+
+            div = parse_soup.find(id='contentCell')
+
+            current_section = None
+            current_articles = []
+            feeds = []
+            for x in div.findAll(True):
+                if x.name == 'h4':
+                    # Section heading found
+                    if current_articles and current_section:
+                        feeds.append((current_section, current_articles))
+                    current_section = self.tag_to_string(x)
+                    current_articles = []
+                    self.log('\tFound section:', current_section)
+                if current_section is not None and x.name == 'strong':
+                    title = self.tag_to_string(x)
+                    p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x)
+                    if p is None:
+                        continue
+                    url = p.get('href', False)
+                    if not url or not title:
+                        continue
+                    if url.startswith('/'):
+                         url = 'http://www3.interscience.wiley.com'+url
+                    url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
+                    self.log('\t\tFound article:', title)
+                    self.log('\t\t\t', url)
+                    #if url.startswith('/'):
+                        #url = 'http://online.wsj.com'+url
+                    current_articles.append({'title': title, 'url':url,
+                        'description':'', 'date':''})
+
+            if current_articles and current_section:
+                feeds.append((current_section, current_articles))
+
+            return feeds
+
+    def preprocess_html(self, soup):
+        for img in soup.findAll('img', src=True):
+            img['src'] = img['src'].replace('tfig', 'nfig')
+        return soup
+        
\ No newline at end of file