From 08834f636a73464366be017cb1074b85be72c2cf Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Jul 2013 22:47:57 +0530 Subject: [PATCH 01/16] Update mediapart.fr --- recipes/mediapart.recipe | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index f1e6c87385..a457b713f2 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -1,17 +1,18 @@ __license__ = 'GPL v3' -__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' +__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ; 2013, Malah ' ''' Mediapart ''' -__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' +__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ; 2013, Malah ' +import re from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class Mediapart(BasicNewsRecipe): title = 'Mediapart' - __author__ = 'Mathieu Godlewski, Louis Gesbert' + __author__ = 'Mathieu Godlewski, Louis Gesbert, Malah' description = 'Global news in french from news site Mediapart' oldest_article = 7 language = 'fr' @@ -21,6 +22,7 @@ class Mediapart(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True + masthead_url = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png' cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg' feeds = [ @@ -36,18 +38,18 @@ class Mediapart(BasicNewsRecipe): def print_version(self, url): raw = self.browser.open(url).read() soup = BeautifulSoup(raw.decode('utf8', 'replace')) - link = soup.find('a', {'title':'Imprimer'}) + link = soup.find('a', {'href':re.compile('^/print/[0-9]+')}) if link is None: return None - return link['href'] + return 'http://www.mediapart.fr' + link['href'] # -- Handle login def get_browser(self): br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: - br.open('http://www.mediapart.fr/') - br.select_form(nr=0) + br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition') + br.select_form(nr=1) br['name'] = self.username br['pass'] = self.password br.submit() From a41a945e8f33a856d055f9c55c3cc34358b0023d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Jul 2013 10:26:50 +0530 Subject: [PATCH 02/16] Add FAQ about windows temp folder permissions --- manual/faq.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/manual/faq.rst b/manual/faq.rst index bdac21a622..64da7cd7ef 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -776,6 +776,29 @@ The only way to find the culprit is to eliminate the programs one by one and see which one is causing the issue. Basically, stop a program, run calibre, check for crashes. If they still happen, stop another program and repeat. + +Using the viewer or doing any conversions results in a permission denied error on windows +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Something on your computer is preventing calibre from accessing its own +temporary files. Most likely the permissions on your Temp folder are incorrect. +Go to the folder file:`C:\\Users\\USERNAME\\AppData\\Local` in Windows +Explorer and then right click on the file:`Temp` folder, select Properties and go to +the Security tab. Make sure that your user account has full control for this +folder. + +Some users have reported that running the following command in an Administrator +Command Prompt fixed their permissions. To get an Administrator Command Prompt +search for cmd.exe in the start menu, then right click on the command prompt +entry and select Run as Administrator:: + icacls "%appdata%\..\Local\Temp" /reset /T + +Alternately, you can run calibre as Administrator, but doing so will cause +some functionality, such as drag and drop to not work. + +Finally, some users have reported that disabling UAC fixes the problem. + + |app| is not starting on OS X? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From a8deb4b1f8dfb768a32b95b1540be32d5d6e871e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Jul 2013 10:30:34 +0530 Subject: [PATCH 03/16] Ignore type errors when sorting device collections Invalid data in the device database on sony readers could cause errors when sorting device collections, ignore those errors. --- src/calibre/devices/usbms/books.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/usbms/books.py b/src/calibre/devices/usbms/books.py index bbbb3938ac..5254a814be 100644 --- a/src/calibre/devices/usbms/books.py +++ b/src/calibre/devices/usbms/books.py @@ -283,11 +283,17 @@ class CollectionsBookList(BookList): return -1 if isinstance(x, basestring) and isinstance(y, basestring): x, y = sort_key(force_unicode(x)), sort_key(force_unicode(y)) - c = cmp(x, y) + try: + c = cmp(x, y) + except TypeError: + c = 0 if c != 0: return c # same as above -- no sort_key needed here - return cmp(xx[2], yy[2]) + try: + return cmp(xx[2], yy[2]) + except TypeError: + return 0 for category, lpaths in collections.items(): books = lpaths.values() From fae8aa1405279b630456880f0689f2eece115154 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Jul 2013 11:32:40 +0530 Subject: [PATCH 04/16] Dont change into temp dir when downloading single covers Works around the problem with temp dir permissions on some windows computers. --- src/calibre/ebooks/metadata/sources/worker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/worker.py b/src/calibre/ebooks/metadata/sources/worker.py index 51fb883e7d..1c83f965e1 100644 --- a/src/calibre/ebooks/metadata/sources/worker.py +++ b/src/calibre/ebooks/metadata/sources/worker.py @@ -106,7 +106,6 @@ def single_identify(title, authors, identifiers): r in results], dump_caches(), log.dump() def single_covers(title, authors, identifiers, caches, tdir): - os.chdir(tdir) load_caches(caches) log = GUILog() results = Queue() @@ -126,9 +125,9 @@ def single_covers(title, authors, identifiers, caches, tdir): name += '{%d}'%c[plugin.name] c[plugin.name] += 1 name = '%s,,%s,,%s,,%s.cover'%(name, width, height, fmt) - with open(name, 'wb') as f: + with open(os.path.join(tdir, name), 'wb') as f: f.write(data) - os.mkdir(name+'.done') + os.mkdir(os.path.join(tdir, name+'.done')) return log.dump() From 3a9fa00032fd8dca84848fcc979a5f97f8def534 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Jul 2013 12:24:39 +0530 Subject: [PATCH 05/16] Try to automatically fix temp folder permissions on windows --- manual/faq.rst | 4 +++- src/calibre/__init__.py | 12 ++++++++++-- src/calibre/customize/conversion.py | 2 +- src/calibre/ptempfile.py | 13 +++++++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/manual/faq.rst b/manual/faq.rst index 64da7cd7ef..e5a6342cf8 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -790,7 +790,9 @@ folder. Some users have reported that running the following command in an Administrator Command Prompt fixed their permissions. To get an Administrator Command Prompt search for cmd.exe in the start menu, then right click on the command prompt -entry and select Run as Administrator:: +entry and select Run as Administrator. At the command prompt type the following +command and press Enter:: + icacls "%appdata%\..\Local\Temp" /reset /T Alternately, you can run calibre as Administrator, but doing so will cause diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 07ad906247..5d938ecc55 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -436,13 +436,21 @@ def fit_image(width, height, pwidth, pheight): class CurrentDir(object): - def __init__(self, path): + def __init__(self, path, workaround_temp_folder_permissions=False): self.path = path self.cwd = None + self.workaround_temp_folder_permissions = workaround_temp_folder_permissions def __enter__(self, *args): self.cwd = os.getcwdu() - os.chdir(self.path) + try: + os.chdir(self.path) + except OSError: + if not self.workaround_temp_folder_permissions: + raise + from calibre.ptempfile import reset_temp_folder_permissions + reset_temp_folder_permissions() + os.chdir(self.path) return self.cwd def __exit__(self, *args): diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 38ffcef71f..9a7ed0d24c 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -233,7 +233,7 @@ class InputFormatPlugin(Plugin): # In case stdout is broken pass - with CurrentDir(output_dir): + with CurrentDir(output_dir, workaround_temp_folder_permissions=True): for x in os.listdir('.'): shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) diff --git a/src/calibre/ptempfile.py b/src/calibre/ptempfile.py index 96271fbeaf..f3816f766b 100644 --- a/src/calibre/ptempfile.py +++ b/src/calibre/ptempfile.py @@ -34,6 +34,19 @@ def app_prefix(prefix): return '%s_'%__appname__ return '%s_%s_%s'%(__appname__, __version__, prefix) +def reset_temp_folder_permissions(): + # There are some broken windows installs where the permissions for the temp + # folder are set to not be executable, which means chdir() into temp + # folders fails. Try to fix that by resetting the permissions on the temp + # folder. + global _base_dir + if iswindows and _base_dir: + import subprocess + from calibre import prints + parent = os.path.dirname(_base_dir) + retcode = subprocess.Popen(['icacls.exe', parent, '/reset', '/Q', '/T']).wait() + prints('Trying to reset permissions of temp folder', parent, 'return code:', retcode) + def base_dir(): global _base_dir if _base_dir is not None and not os.path.exists(_base_dir): From af3d990264298697d907769e3e1a2ac777aa4921 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Jul 2013 12:52:51 +0530 Subject: [PATCH 06/16] Edelweiss: Workaround broken advanced search Edelweiss metadata download plugin: Workaround for advanced search being broken at the Edelweiss website. --- .../ebooks/metadata/sources/edelweiss.py | 59 +++++++++++++++---- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/edelweiss.py b/src/calibre/ebooks/metadata/sources/edelweiss.py index 27fd296503..fab0b2017d 100644 --- a/src/calibre/ebooks/metadata/sources/edelweiss.py +++ b/src/calibre/ebooks/metadata/sources/edelweiss.py @@ -34,7 +34,7 @@ def astext(node): return etree.tostring(node, method='text', encoding=unicode, with_tail=False).strip() -class Worker(Thread): # {{{ +class Worker(Thread): # {{{ def __init__(self, sku, url, relevance, result_queue, br, timeout, log, plugin): Thread.__init__(self) @@ -154,8 +154,8 @@ class Worker(Thread): # {{{ # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Collapse whitespace - #desc = re.sub('\n+', '\n', desc) - #desc = re.sub(' +', ' ', desc) + # desc = re.sub('\n+', '\n', desc) + # desc = re.sub(' +', ' ', desc) # Remove comments desc = re.sub(r'(?s)', '', desc) return sanitize_comments_html(desc) @@ -183,14 +183,14 @@ class Edelweiss(Source): if sku: return 'http://edelweiss.abovethetreeline.com/ProductDetailPage.aspx?sku=%s'%sku - def get_book_url(self, identifiers): # {{{ + def get_book_url(self, identifiers): # {{{ sku = identifiers.get('edelweiss', None) if sku: return 'edelweiss', sku, self._get_book_url(sku) # }}} - def get_cached_cover_url(self, identifiers): # {{{ + def get_cached_cover_url(self, identifiers): # {{{ sku = identifiers.get('edelweiss', None) if not sku: isbn = identifiers.get('isbn', None) @@ -199,7 +199,7 @@ class Edelweiss(Source): return self.cached_identifier_to_cover_url(sku) # }}} - def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ + def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ from urllib import urlencode BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?' params = { @@ -239,9 +239,40 @@ class Edelweiss(Source): params[k] = v.encode('utf-8') return BASE_URL+urlencode(params) + + def create_query2(self, log, title=None, authors=None, identifiers={}): + ''' The edelweiss advanced search appears to be broken, use the keyword search instead, until it is fixed. ''' + from urllib import urlencode + BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?' + params = { + 'group':'search', + 'section':'CatalogOverview', + 'searchType':1, + 'searchOrgID':'', + 'searchCatalogID': '', + 'searchMailingID': '', + 'searchSelect':1, + } + keywords = [] + isbn = check_isbn(identifiers.get('isbn', None)) + if isbn is not None: + keywords.append(isbn) + elif title or authors: + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + keywords.extend(title_tokens) + author_tokens = self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + keywords.extend(author_tokens) + if not keywords: + return None + params['keywords'] = (' '.join(keywords)).encode('utf-8') + return BASE_URL+urlencode(params) + # }}} - def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): from urlparse import parse_qs @@ -251,11 +282,12 @@ class Edelweiss(Source): entries = [(book_url, identifiers['edelweiss'])] else: entries = [] - query = self.create_query(log, title=title, authors=authors, + query = self.create_query2(log, title=title, authors=authors, identifiers=identifiers) if not query: log.error('Insufficient metadata to construct query') return + log('Using query URL:', query) try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: @@ -270,7 +302,8 @@ class Edelweiss(Source): for entry in CSSSelect('div.listRow div.listRowMain')(root): a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "ProductDetailPage.aspx")]') - if not a: continue + if not a: + continue href = a[0].get('href') prefix, qs = href.partition('?')[0::2] sku = parse_qs(qs).get('sku', None) @@ -288,7 +321,7 @@ class Edelweiss(Source): div = CSSSelect('div.format.attGroup')(entry) text = astext(div[0]).lower() - if 'audio' in text or 'mp3' in text: # Audio-book, ignore + if 'audio' in text or 'mp3' in text: # Audio-book, ignore continue entries.append((self._get_book_url(sku), sku)) @@ -321,7 +354,7 @@ class Edelweiss(Source): # }}} - def download_cover(self, log, result_queue, abort, # {{{ + def download_cover(self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: @@ -381,7 +414,7 @@ if __name__ == '__main__': ), - ( # Pubdate + ( # Pubdate {'title':'The Great Gatsby', 'authors':['F. Scott Fitzgerald']}, [title_test('The great gatsby', exact=True), authors_test(['F. Scott Fitzgerald']), pubdate_test(2004, 9, 29)] @@ -395,3 +428,5 @@ if __name__ == '__main__': test_identify_plugin(Edelweiss.name, tests) + + From c4cb0e445e9d2dc438f878cbe7590ff060b7c9c4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Jul 2013 15:56:09 +0530 Subject: [PATCH 07/16] Update amazon metadata download plugin for website changes Amazon metadata download: Update plugin to deal with the new amazon.com website --- src/calibre/ebooks/metadata/sources/amazon.py | 144 +++++++++++++----- 1 file changed, 104 insertions(+), 40 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index eb9e5a18cc..028bad6922 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -19,6 +19,11 @@ from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase, from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.localization import canonicalize_lang +def CSSSelect(expr): + from cssselect import HTMLTranslator + from lxml.etree import XPath + return XPath(HTMLTranslator().css_to_xpath(expr)) + class Worker(Thread): # Get details {{{ ''' @@ -142,6 +147,8 @@ class Worker(Thread): # Get details {{{ starts-with(text(), "Editora:") or \ starts-with(text(), "出版社:")] ''' + self.publisher_names = {'Publisher', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'} + self.language_xpath = ''' descendant::*[ starts-with(text(), "Language:") \ @@ -153,6 +160,7 @@ class Worker(Thread): # Get details {{{ or starts-with(text(), "言語") \ ] ''' + self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語'} self.ratings_pat = re.compile( r'([0-9.]+) ?(out of|von|su|étoiles sur|つ星のうち|de un máximo de|de) ([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas)){0,1}') @@ -310,36 +318,44 @@ class Worker(Thread): # Get details {{{ self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) - pd = root.xpath(self.pd_xpath) - if pd: - pd = pd[0] - + non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root) + if non_hero: + # New style markup try: - isbn = self.parse_isbn(pd) - if isbn: - self.isbn = mi.isbn = isbn + self.parse_new_details(root, mi, non_hero[0]) except: - self.log.exception('Error parsing ISBN for url: %r'%self.url) - - try: - mi.publisher = self.parse_publisher(pd) - except: - self.log.exception('Error parsing publisher for url: %r'%self.url) - - try: - mi.pubdate = self.parse_pubdate(pd) - except: - self.log.exception('Error parsing publish date for url: %r'%self.url) - - try: - lang = self.parse_language(pd) - if lang: - mi.language = lang - except: - self.log.exception('Error parsing language for url: %r'%self.url) - + self.log.exception('Failed to parse new-style book details section') else: - self.log.warning('Failed to find product description for url: %r'%self.url) + pd = root.xpath(self.pd_xpath) + if pd: + pd = pd[0] + + try: + isbn = self.parse_isbn(pd) + if isbn: + self.isbn = mi.isbn = isbn + except: + self.log.exception('Error parsing ISBN for url: %r'%self.url) + + try: + mi.publisher = self.parse_publisher(pd) + except: + self.log.exception('Error parsing publisher for url: %r'%self.url) + + try: + mi.pubdate = self.parse_pubdate(pd) + except: + self.log.exception('Error parsing publish date for url: %r'%self.url) + + try: + lang = self.parse_language(pd) + if lang: + mi.language = lang + except: + self.log.exception('Error parsing language for url: %r'%self.url) + + else: + self.log.warning('Failed to find product description for url: %r'%self.url) mi.source_relevance = self.relevance @@ -359,7 +375,13 @@ class Worker(Thread): # Get details {{{ for l in link: return l.get('href').rpartition('/')[-1] + def totext(self, elem): + return self.tostring(elem, encoding=unicode, method='text').strip() + def parse_title(self, root): + h1 = root.xpath('//h1[@id="title"]') + if h1: + return self.totext(h1[0]) tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0] actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]') if actual_title: @@ -373,6 +395,11 @@ class Worker(Thread): # Get details {{{ return ans def parse_authors(self, root): + matches = CSSSelect('#byline .author .contributorNameID')(root) + if matches: + authors = [self.totext(x) for x in matches] + return [a for a in authors if a] + x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]' aname = root.xpath(x) if not aname: @@ -420,8 +447,8 @@ class Worker(Thread): # Get details {{{ # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Collapse whitespace - #desc = re.sub('\n+', '\n', desc) - #desc = re.sub(' +', ' ', desc) + # desc = re.sub('\n+', '\n', desc) + # desc = re.sub(' +', ' ', desc) # Remove the notice about text referring to out of print editions desc = re.sub(r'(?s)--This text ref.*?', '', desc) # Remove comments @@ -429,6 +456,17 @@ class Worker(Thread): # Get details {{{ return sanitize_comments_html(desc) def parse_comments(self, root): + ns = CSSSelect('#bookDescription_feature_div noscript')(root) + if ns: + ns = ns[0] + if len(ns) == 0 and ns.text: + import html5lib + # html5lib parsed noscript as CDATA + ns = html5lib.parseFragment('
%s
' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0] + else: + ns.tag = 'div' + return self._render_comments(ns) + ans = '' desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]') if desc: @@ -472,6 +510,37 @@ class Worker(Thread): # Get details {{{ bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1])) return ('/'.join(parts[:-1]))+'/'+bn + def parse_new_details(self, root, mi, non_hero): + table = non_hero.xpath('descendant::table')[0] + for tr in table.xpath('descendant::tr'): + cells = tr.xpath('descendant::td') + if len(cells) == 2: + name = self.totext(cells[0]) + val = self.totext(cells[1]) + if not val: + continue + if name in self.language_names: + ans = self.lang_map.get(val, None) + if not ans: + ans = canonicalize_lang(val) + if ans: + mi.language = ans + elif name in self.publisher_names: + pub = val.partition(';')[0].partition('(')[0].strip() + if pub: + mi.publisher = pub + date = val.rpartition('(')[-1].replace(')', '').strip() + try: + from calibre.utils.date import parse_only_date + date = self.delocalize_datestr(date) + mi.pubdate = parse_only_date(date, assume_utc=True) + except: + self.log.exception('Failed to parse pubdate: %s' % val) + elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}: + ans = check_isbn(val) + if ans: + self.isbn = mi.isbn = ans + def parse_isbn(self, pd): items = pd.xpath( 'descendant::*[starts-with(text(), "ISBN")]') @@ -721,9 +790,9 @@ class Amazon(Source): def title_ok(title): title = title.lower() - bad = ['bulk pack', '[audiobook]', '[audio cd]'] + bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )'] if self.domain == 'com': - bad.append('(spanish edition)') + bad.extend(['(%s edition)' % x for x in ('spanish', 'german')]) for x in bad: if x in title: return False @@ -901,14 +970,9 @@ if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e # src/calibre/ebooks/metadata/sources/amazon.py from calibre.ebooks.metadata.sources.test import (test_identify_plugin, - isbn_test, title_test, authors_test, comments_test, series_test) + isbn_test, title_test, authors_test, comments_test) com_tests = [ # {{{ - ( # Has a spanish edition - {'title':'11/22/63'}, - [title_test('11/22/63: A Novel', exact=True), authors_test(['Stephen King']),] - ), - ( # + in title and uses id="main-image" for cover {'title':'C++ Concurrency in Action'}, [title_test('C++ Concurrency in Action: Practical Multithreading', @@ -916,11 +980,10 @@ if __name__ == '__main__': # tests {{{ ] ), - ( # Series + ( # noscript description {'identifiers':{'amazon':'0756407117'}}, [title_test( - "Throne of the Crescent Moon", - exact=True), series_test('Crescent Moon Kingdoms', 1), + "Throne of the Crescent Moon"), comments_test('Makhslood'), ] ), @@ -1054,3 +1117,4 @@ if __name__ == '__main__': # tests {{{ # }}} + From a94539c32b4f2d3a359f733ee0a277af72c8c8ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 14:44:46 +0530 Subject: [PATCH 08/16] jsbrowser: Fix handling of html with non lxml safe chars --- src/calibre/web/fetch/javascript.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/web/fetch/javascript.py b/src/calibre/web/fetch/javascript.py index 6e9ef86ff1..d7dfcf0a6a 100644 --- a/src/calibre/web/fetch/javascript.py +++ b/src/calibre/web/fetch/javascript.py @@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir): elem.removeFromDocument() def save_html(browser, output_dir, postprocess_html, url, recursion_level): - html = strip_encoding_declarations(browser.html) import html5lib + from calibre.utils.cleantext import clean_xml_chars + html = strip_encoding_declarations(browser.html) + if isinstance(html, unicode): + html = clean_xml_chars(html) root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot() root = postprocess_html(root, url, recursion_level) if root is None: From bba659b852ded2b2e026eb0be58c67b0862cda6f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 14:53:40 +0530 Subject: [PATCH 09/16] Add a check for modern WebKit --- src/calibre/constants.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 99146e206c..5c9ecbc832 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -282,3 +282,8 @@ def get_windows_user_locale_name(): return None return u'_'.join(buf.value.split(u'-')[:2]) +def is_modern_webkit(): + # Check if we are using QtWebKit >= 2.3 + from PyQt4.QtWebKit import qWebKitMajorVersion + return qWebKitMajorVersion() >= 537 + From ed55e76ff4557c4c3558b28b6dcf29c452d9e9e1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 15:45:20 +0530 Subject: [PATCH 10/16] Update cracked.com --- recipes/cracked_com.recipe | 68 +++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/recipes/cracked_com.recipe b/recipes/cracked_com.recipe index 829299ae17..a702f93433 100644 --- a/recipes/cracked_com.recipe +++ b/recipes/cracked_com.recipe @@ -1,63 +1,55 @@ from calibre.web.feeds.news import BasicNewsRecipe -class Cracked(BasicNewsRecipe): - title = u'Cracked.com' - __author__ = 'UnWeave' - language = 'en' - description = "America's Only HumorSite since 1958" - publisher = 'Cracked' - category = 'comedy, lists' - oldest_article = 3 #days - max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'ascii' - remove_javascript = True - use_embedded_content = False - feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ] +class Cracked(BasicNewsRecipe): + title = u'Cracked.com' + __author__ = 'UnWeave' + language = 'en' + description = "America's Only HumorSite since 1958" + publisher = 'Cracked' + category = 'comedy, lists' + oldest_article = 3 # days + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'ascii' + remove_javascript = True + use_embedded_content = False + # auto_cleanup = True + + feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')] conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - remove_tags_before = dict(id='PrimaryContent') + # remove_tags_before = dict(id='PrimaryContent') - remove_tags_after = dict(name='div', attrs={'class':'shareBar'}) + keep_only_tags = dict(name='article', attrs={ + 'class': 'module article dropShadowBottomCurved'}) - remove_tags = [ dict(name='div', attrs={'class':['social', - 'FacebookLike', - 'shareBar' - ]}), + # remove_tags_after = dict(name='div', attrs={'class':'shareBar'}) - dict(name='div', attrs={'id':['inline-share-buttons', - ]}), - - dict(name='span', attrs={'class':['views', - 'KonaFilter' - ]}), - #dict(name='img'), - ] + remove_tags = [ + dict(name='section', attrs={'class': ['socialTools', 'quickFixModule']})] def appendPage(self, soup, appendTag, position): # Check if article has multiple pages - pageNav = soup.find('nav', attrs={'class':'PaginationContent'}) + pageNav = soup.find('nav', attrs={'class': 'PaginationContent'}) if pageNav: # Check not at last page - nextPage = pageNav.find('a', attrs={'class':'next'}) + nextPage = pageNav.find('a', attrs={'class': 'next'}) if nextPage: nextPageURL = nextPage['href'] nextPageSoup = self.index_to_soup(nextPageURL) # 8th
tag contains article content - nextPageContent = nextPageSoup.findAll('section')[7] + nextPageContent = nextPageSoup.findAll('article')[0] newPosition = len(nextPageContent.contents) - self.appendPage(nextPageSoup,nextPageContent,newPosition) + self.appendPage(nextPageSoup, nextPageContent, newPosition) nextPageContent.extract() pageNav.extract() - appendTag.insert(position,nextPageContent) + appendTag.insert(position, nextPageContent) def preprocess_html(self, soup): self.appendPage(soup, soup.body, 3) return soup + From 26f86ca98741327da50c377a7ec4c09280a81949 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 16:54:11 +0530 Subject: [PATCH 11/16] jsbrowser(): Fix typo causing some images to not be downloaded Fixes images missing in the Time recipe --- src/calibre/web/jsbrowser/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/web/jsbrowser/browser.py b/src/calibre/web/jsbrowser/browser.py index 104347bfb2..387b149bb9 100644 --- a/src/calibre/web/jsbrowser/browser.py +++ b/src/calibre/web/jsbrowser/browser.py @@ -571,7 +571,7 @@ class Browser(QObject, FormsMixin): ans[url] = raw urls.discard(url) - while urls and time.time() - start_time > timeout and self.page.ready_state not in {'complete', 'completed'}: + while urls and time.time() - start_time < timeout and self.page.ready_state not in {'complete', 'completed'}: get_resources() if urls: self.run_for_a_time(0.1) From 03041f925e288b081c5868e163e7d670878ba1fc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 17:33:28 +0530 Subject: [PATCH 12/16] Tweak to restrict list of output formats Add a tweak to restrict the list of output formats available in the conversion dialog. Go to Preferences->Tweaks to change it. --- resources/default_tweaks.py | 7 +++++++ src/calibre/gui2/convert/bulk.py | 15 ++++++--------- src/calibre/gui2/convert/single.py | 22 +++++++++++++++++----- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index a0e8fafd0f..d8e158f842 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -537,3 +537,10 @@ many_libraries = 10 # highlight with this tweak. Set it to 'transparent' to disable highlighting. highlight_virtual_library = 'yellow' +#: Choose available output formats for conversion +# Restrict the list of available output formats in the conversion dialogs. +# For example, if you only want to convert to EPUB and AZW3, change this to +# restrict_output_formats = ['EPUB', 'AZW3']. The default value of None causes +# all available output formats to be present. +restrict_output_formats = None + diff --git a/src/calibre/gui2/convert/bulk.py b/src/calibre/gui2/convert/bulk.py index b1c3de122b..91efc73ca9 100644 --- a/src/calibre/gui2/convert/bulk.py +++ b/src/calibre/gui2/convert/bulk.py @@ -9,8 +9,7 @@ import shutil from PyQt4.Qt import QString, SIGNAL from calibre.gui2.convert.single import (Config, sort_formats_by_preference, - GroupModel, gprefs) -from calibre.customize.ui import available_output_formats + GroupModel, gprefs, get_output_formats) from calibre.gui2 import ResizableDialog from calibre.gui2.convert.look_and_feel import LookAndFeelWidget from calibre.gui2.convert.heuristics import HeuristicsWidget @@ -43,7 +42,6 @@ class BulkConfig(Config): 'values saved in a previous conversion (if they exist) instead ' 'of using the defaults specified in the Preferences')) - self.connect(self.output_formats, SIGNAL('currentIndexChanged(QString)'), self.setup_pipeline) self.connect(self.groups, SIGNAL('activated(QModelIndex)'), @@ -96,7 +94,8 @@ class BulkConfig(Config): while True: c = self.stack.currentWidget() - if not c: break + if not c: + break self.stack.removeWidget(c) widgets = [lf, hw, ps, sd, toc, sr] @@ -118,17 +117,14 @@ class BulkConfig(Config): except: pass - def setup_output_formats(self, db, preferred_output_format): if preferred_output_format: preferred_output_format = preferred_output_format.lower() - output_formats = sorted(available_output_formats(), - key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x)) - output_formats.remove('oeb') + output_formats = get_output_formats(preferred_output_format) preferred_output_format = preferred_output_format if \ preferred_output_format and preferred_output_format \ in output_formats else sort_formats_by_preference(output_formats, - prefs['output_format'])[0] + [prefs['output_format']])[0] self.output_formats.addItems(list(map(QString, [x.upper() for x in output_formats]))) self.output_formats.setCurrentIndex(output_formats.index(preferred_output_format)) @@ -149,3 +145,4 @@ class BulkConfig(Config): bytearray(self.saveGeometry()) return ResizableDialog.done(self, r) + diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py index 1a915288a8..e8342610dd 100644 --- a/src/calibre/gui2/convert/single.py +++ b/src/calibre/gui2/convert/single.py @@ -29,7 +29,7 @@ from calibre.ebooks.conversion.plumber import (Plumber, from calibre.ebooks.conversion.config import delete_specifics from calibre.customize.ui import available_output_formats from calibre.customize.conversion import OptionRecommendation -from calibre.utils.config import prefs +from calibre.utils.config import prefs, tweaks from calibre.utils.logging import Log class NoSupportedInputFormats(Exception): @@ -48,6 +48,20 @@ def sort_formats_by_preference(formats, prefs): return len(prefs) return sorted(formats, key=key) +def get_output_formats(preferred_output_format): + all_formats = {x.upper() for x in available_output_formats()} + all_formats.discard('OEB') + pfo = preferred_output_format.upper() if preferred_output_format else '' + restrict = tweaks['restrict_output_formats'] + if restrict: + fmts = [x.upper() for x in restrict] + if pfo and pfo not in fmts and pfo in all_formats: + fmts.append(pfo) + else: + fmts = list(sorted(all_formats, + key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x))) + return fmts + class GroupModel(QAbstractListModel): def __init__(self, widgets): @@ -239,15 +253,13 @@ class Config(ResizableDialog, Ui_Dialog): preferred_output_format): if preferred_output_format: preferred_output_format = preferred_output_format.lower() - output_formats = sorted(available_output_formats(), - key=lambda x:{'EPUB':'!A', 'MOBI':'!B'}.get(x.upper(), x)) - output_formats.remove('oeb') + output_formats = get_output_formats(preferred_output_format) input_format, input_formats = get_input_format_for_book(db, book_id, preferred_input_format) preferred_output_format = preferred_output_format if \ preferred_output_format in output_formats else \ sort_formats_by_preference(output_formats, - prefs['output_format'])[0] + [prefs['output_format']])[0] self.input_formats.addItems(list(map(QString, [x.upper() for x in input_formats]))) self.output_formats.addItems(list(map(QString, [x.upper() for x in From dca69aa470dd84d1967146e1f854b3313eb3a4a1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 17:50:04 +0530 Subject: [PATCH 13/16] Confirm format override when add files to book When adding formats to an existing book, ask for confirmation if some formats will be overwritten. --- src/calibre/gui2/actions/add.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index 4071ad1468..c5a778f965 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -110,6 +110,19 @@ class AddAction(InterfaceAction): return db = view.model().db + if len(ids) == 1: + formats = db.formats(ids[0], index_is_id=True) + if formats: + formats = {x.upper() for x in formats.split(',')} + nformats = {f.rpartition('.')[-1].upper() for f in books} + override = formats.intersection(nformats) + if override: + title = db.title(ids[0], index_is_id=True) + msg = _('The {0} format(s) will be replaced in the book {1}. Are you sure?').format( + ', '.join(override), title) + if not confirm(msg, 'confirm_format_override_on_add', title=_('Are you sure'), parent=self.gui): + return + for id_ in ids: for fpath in books: fmt = os.path.splitext(fpath)[1][1:].upper() From 2544d48fbeb68f46b5f9009a818c254ef7881a59 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 19:00:16 +0530 Subject: [PATCH 14/16] A bit of formatting --- manual/conversion.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/manual/conversion.rst b/manual/conversion.rst index c693d0be15..2387fe915a 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -772,9 +772,11 @@ size. By default, |app| uses a page size defined by the current :guilabel:`Output profile`. So if your output profile is set to Kindle, |app| will create a PDF with page size suitable for viewing on the small kindle screen. However, if you view this PDF file on a computer screen, then it will -appear to have too large fonts. To create "normal" sized PDFs, use the override -page size option under :guilabel:`PDF Output` in the conversion dialog. +appear to have too large fonts. To create "normal" sized PDFs, use the +:guilabel:`Override page size` option under :guilabel:`PDF Output` in the conversion dialog. +Headers and Footers +^^^^^^^^^^^^^^^^^^^^ You can insert arbitrary headers and footers on each page of the PDF by specifying header and footer templates. Templates are just snippets of HTML code that get rendered in the header and footer locations. For example, to @@ -813,6 +815,9 @@ the page will be used. bottom margins to large enough values, under the Page Setup section of the conversion dialog. +Printable Table of Contents +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + You can also insert a printable Table of Contents at the end of the PDF that lists the page numbers for every section. This is very useful if you intend to print out the PDF to paper. If you wish to use the PDF on an electronic device, From 135a0420b11369a75cbd66a31073e0d9ce5f418d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 21:08:27 +0530 Subject: [PATCH 15/16] Driver for Coby Kyros MID1126 Fixes #1199410 [Unrecognized device Coby Kyros MID1126](https://bugs.launchpad.net/calibre/+bug/1199410) --- src/calibre/devices/android/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 31b60389ad..1880324fdc 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -107,7 +107,7 @@ class ANDROID(USBMS): 0x0ff9 : [0x0226], 0xc91 : HTC_BCDS, 0xdddd : [0x216], - 0xdeed : [0x231], + 0xdeed : [0x231, 0x226], }, # Samsung @@ -241,7 +241,7 @@ class ANDROID(USBMS): 'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E', 'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC', 'F', 'MT65XX_MS', 'ICS', 'E400', '__FILE-STOR_GADG', 'ST80208-1', 'GT-S5660M_CARD', 'XT894', '_USB', - 'PROD_TAB13-201', 'URFPAD2', + 'PROD_TAB13-201', 'URFPAD2', 'MID1126', ] WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', @@ -254,7 +254,7 @@ class ANDROID(USBMS): 'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E', 'NOVO7', 'ADVANCED', 'TABLET_PC', 'F', 'E400_SD_CARD', 'ST80208-1', 'XT894', - '_USB', 'PROD_TAB13-201', 'URFPAD2' + '_USB', 'PROD_TAB13-201', 'URFPAD2', 'MID1126', ] OSX_MAIN_MEM = 'Android Device Main Memory' From 0f6161e5baf11bc573731c09e387ba9a5d9f1faf Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 22:28:14 +0530 Subject: [PATCH 16/16] Update Houston Chronicle --- recipes/houston_chronicle.recipe | 221 +++++++++++++++++++++++++++---- 1 file changed, 193 insertions(+), 28 deletions(-) diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe index ed430aa45a..d7e2ae14c3 100644 --- a/recipes/houston_chronicle.recipe +++ b/recipes/houston_chronicle.recipe @@ -1,41 +1,206 @@ #!/usr/bin/env python -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com' +''' +chron.com +''' +import re, time +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.utils.date import dt_factory, local_tz +from datetime import datetime, timedelta, date +from lxml import html -from calibre.web.feeds.news import BasicNewsRecipe class HoustonChronicle(BasicNewsRecipe): - title = u'The Houston Chronicle' + title = u'The Houston Chronicle' description = 'News from Houston, Texas' - __author__ = 'Kovid Goyal' - language = 'en' - timefmt = ' [%a, %d %b, %Y]' + __author__ = 'Dale Furrow' + language = 'en' no_stylesheets = True - use_embedded_content = False + # use_embedded_content = False remove_attributes = ['style'] - auto_cleanup = True - - oldest_article = 3.0 - - #keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or - #'hst-articletext' in x or 'hst-galleryitem' in x)} + remove_empty_feeds = True + timefmt = '[%a, %d %b %Y]' + timestampfmt = '%Y%m%d%H%M%S' + ignore_duplicate_articles = {'url'} remove_attributes = ['xmlns'] - feeds = [ - ('News', "http://www.chron.com/rss/feed/News-270.php"), - ('Sports', - 'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'), - ('Neighborhood', - 'http://www.chron.com/rss/feed/Neighborhood-305.php'), - ('Business', 'http://www.chron.com/rss/feed/Business-287.php'), - ('Entertainment', - 'http://www.chron.com/rss/feed/Entertainment-293.php'), - ('Editorials', - 'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'), - ('Life', 'http://www.chron.com/rss/feed/Life-297.php'), - ('Science & Tech', - 'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'), - ] + remove_tags = [dict(name='div', attrs={'class':'socialBar'}), + dict(name='div', attrs={'class':re.compile('post-commentmeta')}), + dict(name='div', attrs={'class':re.compile('slideshow_wrapper')}), + dict(name='div', attrs={'class':'entry-summary'}), + dict(name='a', attrs={'rel':'item-license'})] + + baseUrl = 'http://www.chron.com' + + oldest_web_article = 7.0 + + if oldest_web_article is None: + earliest_date = date.today() + else: + earliest_date = date.today() - timedelta(days=oldest_web_article) + + pages = [('news' , '/news/houston-texas/'), + ('business' , '/business/'), + ('opinion', '/opinion/'), + ('sports', '/sports/')] + + def getLinksFromSectionPage(self, sectionUrl): + pageDoc = html.parse(sectionUrl) + els = pageDoc.xpath("""//div[contains(@class, 'scp-item') + or @class='scp-feature' or contains(@class, 'simplelist') + or contains(@class, 'scp-blogpromo')] + //a[@href and not(@target) and not(child::img)]""") + elList = [] + for el in els: + link = el.get('href') + title = el.text + if link[:4] != 'http': + link = self.baseUrl + link + if title is not None: + elList.append((link, el.text)) + return elList + + def getArticleDescriptionFromDoc(self, pageDoc): + descriptionCharsBreak = 140 + descriptionMaxChars = 300 + descXpath = """//div[contains(@class, 'article-body') or + contains(@class, 'resource-content') or contains(@class, 'post')]//p""" + sentenceRegex = re.compile("(\S.+?[.!?])(?=\s+|$)") + + def stringify_children(node): + return ''.join([x for x in node.itertext()]) + try: + els = pageDoc.xpath(descXpath) + outText = "" + ellipsis = "" + for el in els: + sentences = re.findall(sentenceRegex, stringify_children(el)) + for sentence in sentences: + if len(outText) < descriptionCharsBreak: + outText += sentence + " " + else: + if len(outText) > descriptionMaxChars: + ellipsis = "..." + return outText[:descriptionMaxChars] + ellipsis + return outText + except: + self.log('Error on Article Description') + return "" + + def getPublishedTimeFromDoc(self, pageDoc): + regexDateOnly = re.compile("""(?:January|February|March|April| + May|June|July|August|September|October|November| + December)\s[0-9]{1,2},\s20[01][0-9]""") + regextTimeOnly = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""") + def getRegularTimestamp(dateString): + try: + outDate = datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ") + return outDate + except: + return None + def getDateFromString(inText): + match = re.findall(regexDateOnly, inText) + if match: + try: + outDate = datetime.strptime(match[0], "%B %d, %Y") + match = re.findall(regextTimeOnly, inText) + if match: + outTime = datetime.strptime(match[0], "%I:%M %p") + return datetime.combine(outDate.date(), outTime.time()) + return outDate + except: + return None + else: + return None + el = pageDoc.xpath("//*[@class='timestamp'][1]") + if len(el) == 1: + return getRegularTimestamp(el[0].get('title')) + else: + el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]") + if len(el) == 1: + return getDateFromString(el[0].text_content()) + else: + return None + + def getAllFeedDataFromPage(self, page): + articles = [] + linkList = self.getLinksFromSectionPage(self.baseUrl + page[1]) + self.log('from section: ', page[0], " found ", len(linkList), " links") + for link in linkList: + try: + articleDoc = html.parse(link[0]) + description = self.getArticleDescriptionFromDoc(articleDoc) + articleDate = self.getPublishedTimeFromDoc(articleDoc) + if articleDate is not None and description is not None and articleDate.date() > self.earliest_date: + dateText = articleDate.strftime('%a, %d %b') + author = articleDate.strftime(self.timestampfmt) + articles.append({'title':link[1], 'url':link[0], + 'description':description, 'date':dateText, 'author':author}) + self.log(page[0] + ": " + link[1] + ', from ' + dateText + + " description of " + str(len(description)) + ' characters at ' + link[0]) + else: + msg = "" + if articleDate is None: + msg = " No Timestamp Found" + else: + msg = " article older than " + str(self.oldest_web_article) + ' days...' + self.log("Skipping article: ", link[0], msg) + except: + print 'error on fetching ' + link[0] + continue + return articles + + def parse_index(self): + + self.timefmt = ' [%a, %d %b, %Y]' + self.log('starting parse_index: ', time.strftime(self.timestampfmt)) + feeds = [] + for page in self.pages: + articles = [] + articles = self.getAllFeedDataFromPage(page) + if articles: + feeds.append((page[0], articles)) + self.log('finished parse_index: ', time.strftime(self.timestampfmt)) + return feeds + + def preprocess_html(self, thisSoup): + baseTags = [] + baseTags.extend(thisSoup.findAll(name='div', attrs={'id':re.compile('post-\d+')})) + baseTags.extend(thisSoup.findAll(name='div', attrs={'class':'hnews hentry item'})) + allTags = [] + allTags.extend(baseTags) + if len(baseTags) > 0: + for tag in baseTags: + allTags.extend(tag.findAll(True)) + paragraphs = thisSoup.findAll(name='p') + for paragraph in paragraphs: + if paragraph not in allTags: + allTags.append(paragraph) + for tag in baseTags: + while tag.parent is not None: + allTags.append(tag) + tag = tag.parent + for tag in thisSoup.findAll(True): + if tag not in allTags: + tag.extract() + return thisSoup + + def populate_article_metadata(self, article, soup, first): + if not first: + return + try: + article.date = time.strptime(article.author, self.timestampfmt) + article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False) + article.localtime = article.utctime.astimezone(local_tz) + except Exception as inst: # remove after debug + self.log('Exception: ', article.title) # remove after debug + self.log(type(inst)) # remove after debug + self.log(inst) # remove after debug + +