From a7beccd294347debb1c1ae948896e60bd7d928a8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 3 Oct 2011 22:18:28 -0600 Subject: [PATCH 01/58] Fix Slate --- recipes/slate.recipe | 479 ++++++------------------------------------- 1 file changed, 62 insertions(+), 417 deletions(-) diff --git a/recipes/slate.recipe b/recipes/slate.recipe index f2a5b71e3c..36560cdf33 100644 --- a/recipes/slate.recipe +++ b/recipes/slate.recipe @@ -9,285 +9,79 @@ calibre recipe for slate.com import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag class Slate(BasicNewsRecipe): - # Method variables for customizing downloads description = 'A general-interest publication offering analysis and commentary about politics, news and culture.' - __author__ = 'GRiker, Sujata Raman and Nick Redding' - max_articles_per_feed = 100 - oldest_article = 14 - recursions = 0 - delay = 0 - simultaneous_downloads = 5 - timeout = 120.0 + __author__ = 'Kovid Goyal' timefmt = '' - feeds = None no_stylesheets = True - encoding = None language = 'en' + title = 'Slate' + INDEX = 'http://slate.com' + encoding = 'utf-8' + preprocess_regexps = [ + (re.compile(r'', re.DOTALL), lambda x: ''), + (re.compile(r'^.*?]+?/>', re.DOTALL), lambda x:''), + ] + remove_tags = [ + {'name':['link', 'script']}, + {'class':['share-box-flank', 'sl-crumbs', 'sl-tbar', + 'sl-chunky-tbar']}, + ] + remove_tags_after = [{'class':'sl-art-creds-cntr'}] + keep_only_tags = {'class':'sl-body-wrapper'} + remove_attributes = ['style'] - slate_complete = True - if slate_complete: - title = 'Slate (complete)' - else: - title = 'Slate (weekly)' + def print_version(self, url): + return url.replace('.html', '.single.html') - # Method variables for customizing feed parsing - summary_length = 250 - use_embedded_content = None - - # Method variables for pre/post processing of HTML - preprocess_regexps = [ (re.compile(r'

<p><em>Disclosure: Slate is owned by the Washington Post.*</em></p>
', - re.DOTALL|re.IGNORECASE), - lambda match: ''), - (re.compile(r'

<p><em>Join the discussion about this story on.*</em></p>
', - re.DOTALL|re.IGNORECASE), - lambda match: '') ] - - match_regexps = [] - - # The second entry is for 'Big Money', which comes from a different site, uses different markup - keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}), - dict(attrs={ 'id':['content']}) ] - - # The second entry is for 'Big Money', which comes from a different site, uses different markup - remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper', - 'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio', - 'bizbox_links_bottom','ris_links_wrapper','BOXXLE', - 'comments_button','add_comments_button','comments-to-fray','marriott_ad', - 'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}), - dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ] - - excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast'] - excludedTitleKeywords = ['Gabfest','Slate V','on Twitter'] - excludedAuthorKeywords = [] - excludedContentKeywords = ['http://twitter.com/Slate'] - - extra_css = ''' - .h1_subhead{font-family:Arial; font-size:small; } - h1{font-family:Verdana; font-size:large; } - .byline {font-family:Georgia; margin-bottom: 0px; } - .dateline {font-family:Arial; font-size: smaller; height: 0pt;} - .imagewrapper {font-family:Verdana;font-size:x-small; } - .source {font-family:Verdana; font-size:x-small;} - .credit {font-family:Verdana; font-size: smaller;} - #article_body {font-family:Verdana; } - #content {font-family:Arial; } - .caption{font-family:Verdana;font-style:italic; font-size:x-small;} - h3{font-family:Arial; font-size:small} - ''' - - # Local variables to extend class - baseURL = 'http://slate.com' - section_dates = [] - - # class extension methods - def tag_to_strings(self, tag): - if not tag: - return '' - if isinstance(tag, basestring): - return tag - strings = [] - for item in tag.contents: - if isinstance(item, (NavigableString, CData)): - strings.append(item.string) - elif isinstance(item, Tag): - res = self.tag_to_string(item,use_alt=False) - if res: - strings.append(res) - return strings - - def extract_named_sections(self): - soup = self.index_to_soup( self.baseURL ) - soup_nav_bar = soup.find(True, attrs={'id':'nav'}) - briefing_nav = soup.find('li') - briefing_url = briefing_nav.a['href'] - for section_nav in soup_nav_bar.findAll('li'): - section_name = self.tag_to_string(section_nav,use_alt=False) - self.section_dates.append(section_name) - - soup = self.index_to_soup(briefing_url) - - self.log("Briefing url = %s " % briefing_url) - section_lists = soup.findAll('ul','view_links_list') - - sections = [] - for section in section_lists : - sections.append(section) - return sections - - - def extract_dated_sections(self): - soup = self.index_to_soup( self.baseURL ) - soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'}) - if soup_top_stories: - self.section_dates.append("Top Stories") - self.log("SELECTION TOP STORIES %s" % "Top Stories") - - soup = soup.find(True, attrs={'id':'toc_links_container'}) - - todays_section = soup.find(True, attrs={'class':'todaydateline'}) - self.section_dates.append(self.tag_to_string(todays_section,use_alt=False)) - self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False)) - - older_section_dates = soup.findAll(True, attrs={'class':'maindateline'}) - for older_section in older_section_dates : - self.section_dates.append(self.tag_to_string(older_section,use_alt=False)) - self.log("SELECTION DATE %s" % 
self.tag_to_string(older_section,use_alt=False)) - - if soup_top_stories: - headline_stories = soup_top_stories - self.log("HAVE top_stories") - else: - headline_stories = None - self.log("NO top_stories") - section_lists = soup.findAll('ul') - # Prepend the headlines to the first section - if headline_stories: - section_lists.insert(0,headline_stories) - - sections = [] - for section in section_lists : - sections.append(section) - return sections - - - def extract_section_articles(self, sections_html) : - # Find the containers with section content - sections = sections_html - - articles = {} - key = None + def parse_index(self) : ans = [] - - for (i,section) in enumerate(sections) : - - # Get the section name - if section.has_key('id') : - self.log("PROCESSING SECTION id = %s" % section['id']) - key = self.section_dates[i] - if key.startswith("Pod"): - continue - if key.startswith("Blog"): - continue - articles[key] = [] - ans.append(key) - elif self.slate_complete: - key = self.section_dates[i] - if key.startswith("Pod"): - continue - if key.startswith("Blog"): - continue - self.log("PROCESSING SECTION name = %s" % key) - articles[key] = [] - ans.append(key) - else : - self.log("SECTION %d HAS NO id" % i); - continue - - # Get the section article_list - article_list = section.findAll('li') - - # Extract the article attributes - for article in article_list : - bylines = self.tag_to_strings(article) - url = article.a['href'] - title = bylines[0] - full_title = self.tag_to_string(article,use_alt=False) - #self.log("ARTICLE TITLE%s" % title) - #self.log("ARTICLE FULL_TITLE%s" % full_title) - #self.log("URL %s" % url) - author = None - description = None - pubdate = None - - if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 : - description = "A summary of what's in the major U.S. newspapers." - - if len(bylines) == 3 : - author = bylines[2].strip() - author = re.sub('[\r][\n][\t][\t\t]','', author) - author = re.sub(',','', author) - if bylines[1] is not None : - description = bylines[1] - full_byline = self.tag_to_string(article) - if full_byline.find('major U.S. newspapers') > 0 : - description = "A summary of what's in the major U.S. newspapers." 
- - if len(bylines) > 3 and author is not None: - author += " | " - for (i,substring) in enumerate(bylines[3:]) : - #print "substring: %s" % substring.encode('cp1252') - author += substring.strip() - if i < len(bylines[3:]) : - author += " | " - - # Skip articles whose descriptions contain excluded keywords - if description is not None and len(self.excludedDescriptionKeywords): - excluded = re.compile('|'.join(self.excludedDescriptionKeywords)) - found_excluded = excluded.search(description) - if found_excluded : - self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) - continue - - # Skip articles whose title contain excluded keywords - if full_title is not None and len(self.excludedTitleKeywords): - excluded = re.compile('|'.join(self.excludedTitleKeywords)) - #self.log("evaluating full_title: %s" % full_title) - found_excluded = excluded.search(full_title) - if found_excluded : - self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) - continue - - # Skip articles whose author contain excluded keywords - if author is not None and len(self.excludedAuthorKeywords): - excluded = re.compile('|'.join(self.excludedAuthorKeywords)) - found_excluded = excluded.search(author) - if found_excluded : - self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0))) - continue - - skip_this_article = False - # Check to make sure we're not adding a duplicate - for article in articles[key] : - if article['url'] == url : - skip_this_article = True - self.log("SKIPPING DUP %s" % url) - break - - if skip_this_article : - continue - - # Build the dictionary entry for this article - feed = key - if not articles.has_key(feed) : - articles[feed] = [] - articles[feed].append(dict(title=title, url=url, date=pubdate, description=description, - author=author, content='')) - #self.log("KEY %s" % feed) - #self.log("APPENDED %s" % url) - # Promote 'newspapers' to top - for (i,article) in enumerate(articles[feed]) : - if article['description'] is not None : - if article['description'].find('newspapers') > 0 : - articles[feed].insert(0,articles[feed].pop(i)) - - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + for sectitle, url in ( + ('News & Politics', '/articles/news_and_politics.html'), + ('Technology', '/articles/technology.html'), + ('Business', '/articles/business.html'), + ('Arts', '/articles/arts.html'), + ('Life', '/articles/life.html'), + ('Health & Science', '/articles/health_and_science.html'), + ('Sports', '/articles/sports.html'), + ('Double X', '/articles/double_x.html'), + ): + url = self.INDEX + url + self.log('Found section:', sectitle) + articles = self.slate_section_articles(self.index_to_soup(url)) + if articles: + ans.append((sectitle, articles)) return ans - def print_version(self, url) : - return url + 'pagenum/all/' - - # Class methods - def parse_index(self) : - if self.slate_complete: - sections = self.extract_named_sections() - else: - sections = self.extract_dated_sections() - section_list = self.extract_section_articles(sections) - return section_list + def slate_section_articles(self, soup): + cont = soup.find('div', id='most_read') + seen = set() + ans = [] + for h4 in cont.findAll('h4'): + a = h4.find('a', href=True) + if a is None: continue + url = a['href'] + if url.startswith('/'): + url = self.INDEX + url + if url in seen: continue + seen.add(url) + title = self.tag_to_string(a) + parent = h4.parent + h3 = parent.find('h3') 
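+ # an h3 in the same container, when present, holds the blurb used as the description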
+ desc = '' + if h3 is not None: + desc = self.tag_to_string(h3) + a = parent.find('a', rel='author') + if a is not None: + a = self.tag_to_string(a) + art = {'title':title, 'description':desc, 'date':'', 'url':url} + if a: + art['author'] = a + self.log('\tFound article:', title, ' by ', a) + ans.append(art) + return ans def get_masthead_url(self): masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif' @@ -299,153 +93,4 @@ class Slate(BasicNewsRecipe): masthead = None return masthead - def stripAnchors(self,soup): - body = soup.find('div',attrs={'id':['article_body','content']}) - if body is not None: - paras = body.findAll('p') - if paras is not None: - for para in paras: - aTags = para.findAll('a') - if aTags is not None: - for a in aTags: - if a.img is None: - #print repr(a.renderContents()) - a.replaceWith(a.renderContents().decode('utf-8','replace')) - return soup - - def preprocess_html(self, soup) : - - # Remove 'grayPlus4.png' images - imgs = soup.findAll('img') - if imgs is not None: - for img in imgs: - if re.search("grayPlus4.png",str(img)): - img.extract() - - # Delete article based upon content keywords - if len(self.excludedDescriptionKeywords): - excluded = re.compile('|'.join(self.excludedContentKeywords)) - found_excluded = excluded.search(str(soup)) - if found_excluded : - print "No allowed content found, removing article" - raise Exception('Rejected article') - - # Articles from www.thebigmoney.com use different tagging for byline, dateline and body - head = soup.find('head') - if head.link is not None and re.search('www\.thebigmoney\.com', str(head)): - byline = soup.find('div',attrs={'id':'byline'}) - if byline is not None: - byline['class'] = byline['id'] - - dateline = soup.find('div',attrs={'id':'dateline'}) - if dateline is not None: - dateline['class'] = dateline['id'] - - body = soup.find('div',attrs={'id':'content'}) - if body is not None: - body['class'] = 'article_body' - - # Synthesize a department kicker - h3Tag = Tag(soup,'h3') - emTag = Tag(soup,'em') - emTag.insert(0,NavigableString("the big money: Today's business press")) - h3Tag.insert(0,emTag) - soup.body.insert(0,h3Tag) - - # Strip anchors from HTML - return self.stripAnchors(soup) - - def postprocess_html(self, soup, first_fetch) : - - # Fix up dept_kicker as
<h3><em>
- dept_kicker = soup.find('div', attrs={'class':'department_kicker'}) - if dept_kicker is not None : - kicker_strings = self.tag_to_strings(dept_kicker) - kicker = ''.join(kicker_strings[2:]) - kicker = re.sub('\.','',kicker) - h3Tag = Tag(soup, "h3") - emTag = Tag(soup, "em") - emTag.insert(0,NavigableString(kicker)) - h3Tag.insert(0, emTag) - dept_kicker.replaceWith(h3Tag) - else: - self.log("No kicker--return null") - return None - - # Fix up the concatenated byline and dateline - byline = soup.find(True,attrs={'class':'byline'}) - if byline is not None : - bylineTag = Tag(soup,'div') - bylineTag['class'] = 'byline' - #bylineTag['height'] = '0em' - bylineTag.insert(0,self.tag_to_string(byline)) - byline.replaceWith(bylineTag) - - dateline = soup.find(True, attrs={'class':'dateline'}) - if dateline is not None : - datelineTag = Tag(soup, 'div') - datelineTag['class'] = 'dateline' - #datelineTag['margin-top'] = '0em' - datelineTag.insert(0,self.tag_to_string(dateline)) - dateline.replaceWith(datelineTag) - - # Change captions to italic, add
<hr> - for caption in soup.findAll(True, {'class':'caption'}) : - if caption is not None: - emTag = Tag(soup, "em") - emTag.insert(0, '<br />
' + self.tag_to_string(caption)) - hrTag = Tag(soup, 'hr') - emTag.insert(1, hrTag) - caption.replaceWith(emTag) - - # Fix photos - for photo in soup.findAll('span',attrs={'class':'imagewrapper'}): - if photo.a is not None and photo.a.img is not None: - divTag = Tag(soup,'div') - divTag['class'] ='imagewrapper' - divTag.insert(0,photo.a.img) - photo.replaceWith(divTag) - - return soup - - def postprocess_book(self, oeb, opts, log) : - - def extract_byline(href) : - soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) - byline = soup.find(True,attrs={'class':'byline'}) - if byline is not None: - return self.tag_to_string(byline,use_alt=False) - else : - return None - - def extract_description(href) : - soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) - paragraphs = soup.findAll('p') - for p in paragraphs : - if self.tag_to_string(p,use_alt=False).startswith('By ') or \ - self.tag_to_string(p,use_alt=False).startswith('Posted '): - continue - comment = p.find(text=lambda text:isinstance(text, Comment)) - if comment is not None: - continue - else: - return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...' - - return None - - # Method entry point here - # Single section toc looks different than multi-section tocs - if oeb.toc.depth() == 2 : - for article in oeb.toc : - if article.author is None : - article.author = extract_byline(article.href) - if article.description is None : - article.description = extract_description(article.href) - elif oeb.toc.depth() == 3 : - for section in oeb.toc : - for article in section : - if article.author is None : - article.author = extract_byline(article.href) - if article.description is None : - article.description = extract_description(article.href) From e02b1ace034a234135525e77ad5a4a6bc4edd9eb Mon Sep 17 00:00:00 2001 From: Anthon Date: Tue, 4 Oct 2011 08:34:42 +0200 Subject: [PATCH 02/58] Added a check_call method that prints out the commandline handed to subprocess.check_call if that call fails. This helps quickly locating which external commands are missing (without having to look at the source code). E.g. if you don't know that the qmc variable specifies the qmake command. --- setup/extensions.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/setup/extensions.py b/setup/extensions.py index 87d05c492d..ccff4b6ff7 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -336,7 +336,7 @@ class Build(Command): oinc = ['/Fo'+obj] if iswindows else ['-o', obj] cmd = [compiler] + cflags + ext.cflags + einc + sinc + oinc self.info(' '.join(cmd)) - subprocess.check_call(cmd) + self.check_call(cmd) dest = self.dest(ext) elib = self.lib_dirs_to_ldflags(ext.lib_dirs) @@ -350,18 +350,32 @@ class Build(Command): else: cmd += objects + ext.extra_objs + ['-o', dest] + ldflags + ext.ldflags + elib + xlib self.info('\n\n', ' '.join(cmd), '\n\n') - subprocess.check_call(cmd) + self.check_call(cmd) if iswindows: #manifest = dest+'.manifest' #cmd = [MT, '-manifest', manifest, '-outputresource:%s;2'%dest] #self.info(*cmd) - #subprocess.check_call(cmd) + #self.check_call(cmd) #os.remove(manifest) for x in ('.exp', '.lib'): x = os.path.splitext(dest)[0]+x if os.path.exists(x): os.remove(x) + def check_call(self, *args, **kwargs): + """print cmdline if an error occured + + If something is missing (qmake e.g.) you get a non-informative error + self.check_call(qmc + [ext.name+'.pro']) + so you would have to look a the source to see the actual command. 
+ """ + try: + subprocess.check_call(*args, **kwargs) + except: + cmdline = ' '.join(['"%s"' % (arg) if ' ' in arg else arg for arg in args[0]]) + print "Error while executing: %s\n" % (cmdline) + raise + def build_qt_objects(self, ext): obj_pat = 'release\\*.obj' if iswindows else '*.o' objects = glob.glob(obj_pat) @@ -380,8 +394,8 @@ class Build(Command): qmc = [QMAKE, '-o', 'Makefile'] if iswindows: qmc += ['-spec', 'win32-msvc2008'] - subprocess.check_call(qmc + [ext.name+'.pro']) - subprocess.check_call([make, '-f', 'Makefile']) + self.check_call(qmc + [ext.name+'.pro']) + self.check_call([make, '-f', 'Makefile']) objects = glob.glob(obj_pat) return list(map(self.a, objects)) @@ -407,7 +421,7 @@ class Build(Command): cmd = [pyqt.sip_bin+exe, '-w', '-c', src_dir, '-b', sbf, '-I'+\ pyqt.pyqt_sip_dir] + shlex.split(pyqt.pyqt_sip_flags) + [sipf] self.info(' '.join(cmd)) - subprocess.check_call(cmd) + self.check_call(cmd) module = self.j(src_dir, self.b(dest)) if self.newer(dest, [sbf]+qt_objects): mf = self.j(src_dir, 'Makefile') @@ -417,7 +431,7 @@ class Build(Command): makefile.extra_include_dirs = ext.inc_dirs makefile.generate() - subprocess.check_call([make, '-f', mf], cwd=src_dir) + self.check_call([make, '-f', mf], cwd=src_dir) shutil.copy2(module, dest) def clean(self): @@ -457,7 +471,7 @@ class BuildPDF2XML(Command): cmd += ['-I'+x for x in poppler_inc_dirs+magick_inc_dirs] cmd += ['/Fo'+obj, src] self.info(*cmd) - subprocess.check_call(cmd) + self.check_call(cmd) objects.append(obj) if self.newer(dest, objects): @@ -470,7 +484,7 @@ class BuildPDF2XML(Command): png_libs+magick_libs+poppler_libs+ft_libs+jpg_libs+pdfreflow_libs] cmd += ['/OUT:'+dest] + objects self.info(*cmd) - subprocess.check_call(cmd) + self.check_call(cmd) self.info('Binary installed as', dest) From e46e67949a8ee12322c4b7069eeadc3f7a230a6d Mon Sep 17 00:00:00 2001 From: Anthon Date: Tue, 4 Oct 2011 10:11:37 +0200 Subject: [PATCH 03/58] added DJVU Input converter support. The djvu.py and djvubzzdec.py files will probably make it into an installable library of their own, until then I will keep them updated. The DJVU files need to have the OCR-ed layer added (as per DJVU Spec rev 3) This has been tested with files ocr-ed with LizardTech's Document Express Editor. It uses ascii \037 and \035 to mark paragraphs. Other OCR programs might need adapting. 
--- src/calibre/customize/builtins.py | 2 + src/calibre/ebooks/djvu/__init__.py | 9 + src/calibre/ebooks/djvu/djvu.py | 135 +++++ src/calibre/ebooks/djvu/djvubzzdec.py | 740 ++++++++++++++++++++++++++ src/calibre/ebooks/djvu/input.py | 83 +++ 5 files changed, 969 insertions(+) create mode 100644 src/calibre/ebooks/djvu/__init__.py create mode 100644 src/calibre/ebooks/djvu/djvu.py create mode 100644 src/calibre/ebooks/djvu/djvubzzdec.py create mode 100644 src/calibre/ebooks/djvu/input.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a2c0596e0b..1267808296 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -502,6 +502,7 @@ class TXTZMetadataWriter(MetadataWriterPlugin): # }}} from calibre.ebooks.comic.input import ComicInput +from calibre.ebooks.djvu.input import DJVUInput from calibre.ebooks.epub.input import EPUBInput from calibre.ebooks.fb2.input import FB2Input from calibre.ebooks.html.input import HTMLInput @@ -599,6 +600,7 @@ plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon] plugins += [ ComicInput, + DJVUInput, EPUBInput, FB2Input, HTMLInput, diff --git a/src/calibre/ebooks/djvu/__init__.py b/src/calibre/ebooks/djvu/__init__.py new file mode 100644 index 0000000000..1a27624f6b --- /dev/null +++ b/src/calibre/ebooks/djvu/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2011, Anthon van der Neut ' +__docformat__ = 'restructuredtext en' + +''' +Used for DJVU input +''' + diff --git a/src/calibre/ebooks/djvu/djvu.py b/src/calibre/ebooks/djvu/djvu.py new file mode 100644 index 0000000000..8a9282f83e --- /dev/null +++ b/src/calibre/ebooks/djvu/djvu.py @@ -0,0 +1,135 @@ +#! /usr/bin/env python +# coding: utf-8 + +__license__ = 'GPL v3' +__copyright__ = '2011, Anthon van der Neut ' + +# this code is based on: +# Lizardtech DjVu Reference +# DjVu v3 +# November 2005 + +from ruamel.util.program import Program, CountAction + +import os +import sys +import struct +from cStringIO import StringIO + +from .djvubzzdec import BZZDecoder + +class DjvuChunk(object): + def __init__(self, buf, start, end, align=True, bigendian=True, inclheader=False, verbose=0): + self.subtype = None + self._subchunks = [] + self.buf = buf + pos = start + 4 + self.type = buf[start:pos] + self.align = align # whether to align to word (2-byte) boundaries + self.headersize = 0 if inclheader else 8 + if bigendian: + self.strflag = '>' + else: + self.strflag = '<' + oldpos, pos = pos, pos+4 + self.size = struct.unpack(self.strflag+'L', buf[oldpos:pos])[0] + self.dataend = pos + self.size - (8 if inclheader else 0) + if self.type == 'FORM': + oldpos, pos = pos, pos+4 + #print oldpos, pos + self.subtype = buf[oldpos:pos] + #self.headersize += 4 + self.datastart = pos + if verbose > 0: + print 'found', self.type, self.subtype, pos, self.size + if self.type in 'FORM'.split(): + if verbose > 0: + print 'processing substuff %d %d (%x)' % (pos, self.dataend, self.dataend) + numchunks = 0 + while pos < self.dataend: + x = DjvuChunk(buf, pos, start+self.size, verbose=verbose) + numchunks += 1 + self._subchunks.append(x) + newpos = pos + x.size + x.headersize + (1 if (x.size % 2) else 0) + if verbose > 0: + print 'newpos %d %d (%x, %x) %d' % (newpos, self.dataend, newpos, self.dataend, x.headersize) + pos = newpos + if verbose > 0: + print ' end of chunk %d (%x)' % (pos, pos) + + def dump(self, verbose=0, indent=1, out=None): + if out is None: + out = sys.stdout + if verbose > 0: + 
out.write(' ' * indent) + out.write('%s%s [%d]\n' % (self.type, ':' + self.subtype if self.subtype else '', self.size)) + if self.type == 'TXTz': + inbuf = StringIO(self.buf[self.datastart: self.dataend]) + outbuf = StringIO() + decoder = BZZDecoder(inbuf, outbuf) + while True: + xxres = decoder.convert(1024 * 1024) + if not xxres: + break + res = outbuf.getvalue() + l = 0 + for x in res[:3]: + l <<= 8 + l += ord(x) + if verbose > 0: + print >> out, l + out.write(res[3:3+l]) + out.write('\n\f') + if self.type == 'TXTa': + res = self.buf[self.datastart: self.dataend] + l = 0 + for x in res[:3]: + l <<= 8 + l += ord(x) + if verbose > 0: + print >> out, l + out.write(res[3:3+l]) + out.write('\n\f') + for schunk in self._subchunks: + schunk.dump(verbose=verbose, indent=indent+1, out=out) + +class DJVUFile(object): + def __init__(self, instream): + self.instream = instream + buf = self.instream.read(4) + assert(buf == 'AT&T') + buf = self.instream.read() + self.dc = DjvuChunk(buf, 0, len(buf)) + + def get_text(self, outfile=None): + self.dc.dump(out=outfile) + +def main(): + from ruamel.util.program import Program, CountAction + class DJVUDecoder(Program): + def __init__(self): + Program.__init__(self) + + def parser_setup(self): + Program.parser_setup(self) + #self._argparser.add_argument('--combine', '-c', action=CountAction, const=1, nargs=0) + #self._argparser.add_argument('--combine', '-c', type=int, default=1) + #self._argparser.add_argument('--segments', '-s', action='append', nargs='+') + #self._argparser.add_argument('--force', '-f', action='store_true') + #self._argparser.add_argument('classname') + self._argparser.add_argument('file', nargs='+') + + def run(self): + if self._args.verbose > 1: # can be negative with --quiet + print self._args.file + x = DJVUFile(file(self._args.file[0], 'rb')) + x.get_text() + return 0 + + tt = DJVUDecoder() + res = tt.result + if res != 0: + print res + +if __name__ == '__main__': + main() diff --git a/src/calibre/ebooks/djvu/djvubzzdec.py b/src/calibre/ebooks/djvu/djvubzzdec.py new file mode 100644 index 0000000000..e4cb70807a --- /dev/null +++ b/src/calibre/ebooks/djvu/djvubzzdec.py @@ -0,0 +1,740 @@ +#! /usr/bin/env python +# coding: utf-8 + +__license__ = 'GPL v3' +__copyright__ = '2011, Anthon van der Neut ' +#__docformat__ = 'restructuredtext en' + +# Copyright (C) 2011 Anthon van der Neut, Ruamel bvba +# Adapted from Leon Bottou's djvulibre C++ code, +# ( ZPCodec.{cpp,h} and BSByteStream.{cpp,h} ) +# that code was first converted to C removing any dependencies on the DJVU libre +# framework for ByteStream, making it into a ctypes callable shared object +# then to python, and remade into a class +original_copyright_notice = ''' +//C- ------------------------------------------------------------------- +//C- DjVuLibre-3.5 +//C- Copyright (c) 2002 Leon Bottou and Yann Le Cun. +//C- Copyright (c) 2001 AT&T +//C- +//C- This software is subject to, and may be distributed under, the +//C- GNU General Public License, either Version 2 of the license, +//C- or (at your option) any later version. The license should have +//C- accompanied the software or you may obtain a copy of the license +//C- from the Free Software Foundation at http://www.fsf.org . +//C- +//C- This program is distributed in the hope that it will be useful, +//C- but WITHOUT ANY WARRANTY; without even the implied warranty of +//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//C- GNU General Public License for more details. 
+//C- +//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from +//C- Lizardtech Software. Lizardtech Software has authorized us to +//C- replace the original DjVu(r) Reference Library notice by the following +//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu): +//C- +//C- ------------------------------------------------------------------ +//C- | DjVu (r) Reference Library (v. 3.5) +//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved. +//C- | The DjVu Reference Library is protected by U.S. Pat. No. +//C- | 6,058,214 and patents pending. +//C- | +//C- | This software is subject to, and may be distributed under, the +//C- | GNU General Public License, either Version 2 of the license, +//C- | or (at your option) any later version. The license should have +//C- | accompanied the software or you may obtain a copy of the license +//C- | from the Free Software Foundation at http://www.fsf.org . +//C- | +//C- | The computer code originally released by LizardTech under this +//C- | license and unmodified by other parties is deemed "the LIZARDTECH +//C- | ORIGINAL CODE." Subject to any third party intellectual property +//C- | claims, LizardTech grants recipient a worldwide, royalty-free, +//C- | non-exclusive license to make, use, sell, or otherwise dispose of +//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the +//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU +//C- | General Public License. This grant only confers the right to +//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to +//C- | the extent such infringement is reasonably necessary to enable +//C- | recipient to make, have made, practice, sell, or otherwise dispose +//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to +//C- | any greater extent that may be necessary to utilize further +//C- | modifications or combinations. +//C- | +//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY +//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF +//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +//C- +------------------------------------------------------------------ +// +// $Id: BSByteStream.cpp,v 1.9 2007/03/25 20:48:29 leonb Exp $ +// $Name: release_3_5_23 $ +''' + + +MAXBLOCK = 4096 +FREQMAX = 4 +CTXIDS = 3 +MAXLEN = 1024 ** 2 + +# Exception classes used by this module. 
+class BZZDecoderError(Exception): + """This exception is raised when BZZDecode runs into trouble + """ + def __init__(self, msg): + self.msg = msg + def __str__(self): + return "BZZDecoderError: %s" % (self.msg) + + +# This table has been designed for the ZPCoder +# * by running the following command in file 'zptable.sn': +# * (fast-crude (steady-mat 0.0035 0.0002) 260))) +default_ztable = [ + (0x8000, 0x0000, 84, 145), # 000: p=0.500000 ( 0, 0) + (0x8000, 0x0000, 3, 4), # 001: p=0.500000 ( 0, 0) + (0x8000, 0x0000, 4, 3), # 002: p=0.500000 ( 0, 0) + (0x6bbd, 0x10a5, 5, 1), # 003: p=0.465226 ( 0, 0) + (0x6bbd, 0x10a5, 6, 2), # 004: p=0.465226 ( 0, 0) + (0x5d45, 0x1f28, 7, 3), # 005: p=0.430708 ( 0, 0) + (0x5d45, 0x1f28, 8, 4), # 006: p=0.430708 ( 0, 0) + (0x51b9, 0x2bd3, 9, 5), # 007: p=0.396718 ( 0, 0) + (0x51b9, 0x2bd3, 10, 6), # 008: p=0.396718 ( 0, 0) + (0x4813, 0x36e3, 11, 7), # 009: p=0.363535 ( 0, 0) + (0x4813, 0x36e3, 12, 8), # 010: p=0.363535 ( 0, 0) + (0x3fd5, 0x408c, 13, 9), # 011: p=0.331418 ( 0, 0) + (0x3fd5, 0x408c, 14, 10), # 012: p=0.331418 ( 0, 0) + (0x38b1, 0x48fd, 15, 11), # 013: p=0.300585 ( 0, 0) + (0x38b1, 0x48fd, 16, 12), # 014: p=0.300585 ( 0, 0) + (0x3275, 0x505d, 17, 13), # 015: p=0.271213 ( 0, 0) + (0x3275, 0x505d, 18, 14), # 016: p=0.271213 ( 0, 0) + (0x2cfd, 0x56d0, 19, 15), # 017: p=0.243438 ( 0, 0) + (0x2cfd, 0x56d0, 20, 16), # 018: p=0.243438 ( 0, 0) + (0x2825, 0x5c71, 21, 17), # 019: p=0.217391 ( 0, 0) + (0x2825, 0x5c71, 22, 18), # 020: p=0.217391 ( 0, 0) + (0x23ab, 0x615b, 23, 19), # 021: p=0.193150 ( 0, 0) + (0x23ab, 0x615b, 24, 20), # 022: p=0.193150 ( 0, 0) + (0x1f87, 0x65a5, 25, 21), # 023: p=0.170728 ( 0, 0) + (0x1f87, 0x65a5, 26, 22), # 024: p=0.170728 ( 0, 0) + (0x1bbb, 0x6962, 27, 23), # 025: p=0.150158 ( 0, 0) + (0x1bbb, 0x6962, 28, 24), # 026: p=0.150158 ( 0, 0) + (0x1845, 0x6ca2, 29, 25), # 027: p=0.131418 ( 0, 0) + (0x1845, 0x6ca2, 30, 26), # 028: p=0.131418 ( 0, 0) + (0x1523, 0x6f74, 31, 27), # 029: p=0.114460 ( 0, 0) + (0x1523, 0x6f74, 32, 28), # 030: p=0.114460 ( 0, 0) + (0x1253, 0x71e6, 33, 29), # 031: p=0.099230 ( 0, 0) + (0x1253, 0x71e6, 34, 30), # 032: p=0.099230 ( 0, 0) + (0x0fcf, 0x7404, 35, 31), # 033: p=0.085611 ( 0, 0) + (0x0fcf, 0x7404, 36, 32), # 034: p=0.085611 ( 0, 0) + (0x0d95, 0x75d6, 37, 33), # 035: p=0.073550 ( 0, 0) + (0x0d95, 0x75d6, 38, 34), # 036: p=0.073550 ( 0, 0) + (0x0b9d, 0x7768, 39, 35), # 037: p=0.062888 ( 0, 0) + (0x0b9d, 0x7768, 40, 36), # 038: p=0.062888 ( 0, 0) + (0x09e3, 0x78c2, 41, 37), # 039: p=0.053539 ( 0, 0) + (0x09e3, 0x78c2, 42, 38), # 040: p=0.053539 ( 0, 0) + (0x0861, 0x79ea, 43, 39), # 041: p=0.045365 ( 0, 0) + (0x0861, 0x79ea, 44, 40), # 042: p=0.045365 ( 0, 0) + (0x0711, 0x7ae7, 45, 41), # 043: p=0.038272 ( 0, 0) + (0x0711, 0x7ae7, 46, 42), # 044: p=0.038272 ( 0, 0) + (0x05f1, 0x7bbe, 47, 43), # 045: p=0.032174 ( 0, 0) + (0x05f1, 0x7bbe, 48, 44), # 046: p=0.032174 ( 0, 0) + (0x04f9, 0x7c75, 49, 45), # 047: p=0.026928 ( 0, 0) + (0x04f9, 0x7c75, 50, 46), # 048: p=0.026928 ( 0, 0) + (0x0425, 0x7d0f, 51, 47), # 049: p=0.022444 ( 0, 0) + (0x0425, 0x7d0f, 52, 48), # 050: p=0.022444 ( 0, 0) + (0x0371, 0x7d91, 53, 49), # 051: p=0.018636 ( 0, 0) + (0x0371, 0x7d91, 54, 50), # 052: p=0.018636 ( 0, 0) + (0x02d9, 0x7dfe, 55, 51), # 053: p=0.015421 ( 0, 0) + (0x02d9, 0x7dfe, 56, 52), # 054: p=0.015421 ( 0, 0) + (0x0259, 0x7e5a, 57, 53), # 055: p=0.012713 ( 0, 0) + (0x0259, 0x7e5a, 58, 54), # 056: p=0.012713 ( 0, 0) + (0x01ed, 0x7ea6, 59, 55), # 057: p=0.010419 ( 0, 0) + (0x01ed, 0x7ea6, 60, 56), # 058: 
p=0.010419 ( 0, 0) + (0x0193, 0x7ee6, 61, 57), # 059: p=0.008525 ( 0, 0) + (0x0193, 0x7ee6, 62, 58), # 060: p=0.008525 ( 0, 0) + (0x0149, 0x7f1a, 63, 59), # 061: p=0.006959 ( 0, 0) + (0x0149, 0x7f1a, 64, 60), # 062: p=0.006959 ( 0, 0) + (0x010b, 0x7f45, 65, 61), # 063: p=0.005648 ( 0, 0) + (0x010b, 0x7f45, 66, 62), # 064: p=0.005648 ( 0, 0) + (0x00d5, 0x7f6b, 67, 63), # 065: p=0.004506 ( 0, 0) + (0x00d5, 0x7f6b, 68, 64), # 066: p=0.004506 ( 0, 0) + (0x00a5, 0x7f8d, 69, 65), # 067: p=0.003480 ( 0, 0) + (0x00a5, 0x7f8d, 70, 66), # 068: p=0.003480 ( 0, 0) + (0x007b, 0x7faa, 71, 67), # 069: p=0.002602 ( 0, 0) + (0x007b, 0x7faa, 72, 68), # 070: p=0.002602 ( 0, 0) + (0x0057, 0x7fc3, 73, 69), # 071: p=0.001843 ( 0, 0) + (0x0057, 0x7fc3, 74, 70), # 072: p=0.001843 ( 0, 0) + (0x003b, 0x7fd7, 75, 71), # 073: p=0.001248 ( 0, 0) + (0x003b, 0x7fd7, 76, 72), # 074: p=0.001248 ( 0, 0) + (0x0023, 0x7fe7, 77, 73), # 075: p=0.000749 ( 0, 0) + (0x0023, 0x7fe7, 78, 74), # 076: p=0.000749 ( 0, 0) + (0x0013, 0x7ff2, 79, 75), # 077: p=0.000402 ( 0, 0) + (0x0013, 0x7ff2, 80, 76), # 078: p=0.000402 ( 0, 0) + (0x0007, 0x7ffa, 81, 77), # 079: p=0.000153 ( 0, 0) + (0x0007, 0x7ffa, 82, 78), # 080: p=0.000153 ( 0, 0) + (0x0001, 0x7fff, 81, 79), # 081: p=0.000027 ( 0, 0) + (0x0001, 0x7fff, 82, 80), # 082: p=0.000027 ( 0, 0) + (0x5695, 0x0000, 9, 85), # 083: p=0.411764 ( 2, 3) + (0x24ee, 0x0000, 86, 226), # 084: p=0.199988 ( 1, 0) + (0x8000, 0x0000, 5, 6), # 085: p=0.500000 ( 3, 3) + (0x0d30, 0x0000, 88, 176), # 086: p=0.071422 ( 4, 0) + (0x481a, 0x0000, 89, 143), # 087: p=0.363634 ( 1, 2) + (0x0481, 0x0000, 90, 138), # 088: p=0.024388 ( 13, 0) + (0x3579, 0x0000, 91, 141), # 089: p=0.285711 ( 1, 3) + (0x017a, 0x0000, 92, 112), # 090: p=0.007999 ( 41, 0) + (0x24ef, 0x0000, 93, 135), # 091: p=0.199997 ( 1, 5) + (0x007b, 0x0000, 94, 104), # 092: p=0.002611 ( 127, 0) + (0x1978, 0x0000, 95, 133), # 093: p=0.137929 ( 1, 8) + (0x0028, 0x0000, 96, 100), # 094: p=0.000849 ( 392, 0) + (0x10ca, 0x0000, 97, 129), # 095: p=0.090907 ( 1, 13) + (0x000d, 0x0000, 82, 98), # 096: p=0.000276 ( 1208, 0) + (0x0b5d, 0x0000, 99, 127), # 097: p=0.061537 ( 1, 20) + (0x0034, 0x0000, 76, 72), # 098: p=0.001102 ( 1208, 1) + (0x078a, 0x0000, 101, 125), # 099: p=0.040815 ( 1, 31) + (0x00a0, 0x0000, 70, 102), # 100: p=0.003387 ( 392, 1) + (0x050f, 0x0000, 103, 123), # 101: p=0.027397 ( 1, 47) + (0x0117, 0x0000, 66, 60), # 102: p=0.005912 ( 392, 2) + (0x0358, 0x0000, 105, 121), # 103: p=0.018099 ( 1, 72) + (0x01ea, 0x0000, 106, 110), # 104: p=0.010362 ( 127, 1) + (0x0234, 0x0000, 107, 119), # 105: p=0.011940 ( 1, 110) + (0x0144, 0x0000, 66, 108), # 106: p=0.006849 ( 193, 1) + (0x0173, 0x0000, 109, 117), # 107: p=0.007858 ( 1, 168) + (0x0234, 0x0000, 60, 54), # 108: p=0.011925 ( 193, 2) + (0x00f5, 0x0000, 111, 115), # 109: p=0.005175 ( 1, 256) + (0x0353, 0x0000, 56, 48), # 110: p=0.017995 ( 127, 2) + (0x00a1, 0x0000, 69, 113), # 111: p=0.003413 ( 1, 389) + (0x05c5, 0x0000, 114, 134), # 112: p=0.031249 ( 41, 1) + (0x011a, 0x0000, 65, 59), # 113: p=0.005957 ( 2, 389) + (0x03cf, 0x0000, 116, 132), # 114: p=0.020618 ( 63, 1) + (0x01aa, 0x0000, 61, 55), # 115: p=0.009020 ( 2, 256) + (0x0285, 0x0000, 118, 130), # 116: p=0.013652 ( 96, 1) + (0x0286, 0x0000, 57, 51), # 117: p=0.013672 ( 2, 168) + (0x01ab, 0x0000, 120, 128), # 118: p=0.009029 ( 146, 1) + (0x03d3, 0x0000, 53, 47), # 119: p=0.020710 ( 2, 110) + (0x011a, 0x0000, 122, 126), # 120: p=0.005961 ( 222, 1) + (0x05c5, 0x0000, 49, 41), # 121: p=0.031250 ( 2, 72) + (0x00ba, 0x0000, 124, 62), # 122: 
p=0.003925 ( 338, 1) + (0x08ad, 0x0000, 43, 37), # 123: p=0.046979 ( 2, 47) + (0x007a, 0x0000, 72, 66), # 124: p=0.002586 ( 514, 1) + (0x0ccc, 0x0000, 39, 31), # 125: p=0.069306 ( 2, 31) + (0x01eb, 0x0000, 60, 54), # 126: p=0.010386 ( 222, 2) + (0x1302, 0x0000, 33, 25), # 127: p=0.102940 ( 2, 20) + (0x02e6, 0x0000, 56, 50), # 128: p=0.015695 ( 146, 2) + (0x1b81, 0x0000, 29, 131), # 129: p=0.148935 ( 2, 13) + (0x045e, 0x0000, 52, 46), # 130: p=0.023648 ( 96, 2) + (0x24ef, 0x0000, 23, 17), # 131: p=0.199999 ( 3, 13) + (0x0690, 0x0000, 48, 40), # 132: p=0.035533 ( 63, 2) + (0x2865, 0x0000, 23, 15), # 133: p=0.218748 ( 2, 8) + (0x09de, 0x0000, 42, 136), # 134: p=0.053434 ( 41, 2) + (0x3987, 0x0000, 137, 7), # 135: p=0.304346 ( 2, 5) + (0x0dc8, 0x0000, 38, 32), # 136: p=0.074626 ( 41, 3) + (0x2c99, 0x0000, 21, 139), # 137: p=0.241378 ( 2, 7) + (0x10ca, 0x0000, 140, 172), # 138: p=0.090907 ( 13, 1) + (0x3b5f, 0x0000, 15, 9), # 139: p=0.312499 ( 3, 7) + (0x0b5d, 0x0000, 142, 170), # 140: p=0.061537 ( 20, 1) + (0x5695, 0x0000, 9, 85), # 141: p=0.411764 ( 2, 3) + (0x078a, 0x0000, 144, 168), # 142: p=0.040815 ( 31, 1) + (0x8000, 0x0000, 141, 248), # 143: p=0.500000 ( 2, 2) + (0x050f, 0x0000, 146, 166), # 144: p=0.027397 ( 47, 1) + (0x24ee, 0x0000, 147, 247), # 145: p=0.199988 ( 0, 1) + (0x0358, 0x0000, 148, 164), # 146: p=0.018099 ( 72, 1) + (0x0d30, 0x0000, 149, 197), # 147: p=0.071422 ( 0, 4) + (0x0234, 0x0000, 150, 162), # 148: p=0.011940 ( 110, 1) + (0x0481, 0x0000, 151, 95), # 149: p=0.024388 ( 0, 13) + (0x0173, 0x0000, 152, 160), # 150: p=0.007858 ( 168, 1) + (0x017a, 0x0000, 153, 173), # 151: p=0.007999 ( 0, 41) + (0x00f5, 0x0000, 154, 158), # 152: p=0.005175 ( 256, 1) + (0x007b, 0x0000, 155, 165), # 153: p=0.002611 ( 0, 127) + (0x00a1, 0x0000, 70, 156), # 154: p=0.003413 ( 389, 1) + (0x0028, 0x0000, 157, 161), # 155: p=0.000849 ( 0, 392) + (0x011a, 0x0000, 66, 60), # 156: p=0.005957 ( 389, 2) + (0x000d, 0x0000, 81, 159), # 157: p=0.000276 ( 0, 1208) + (0x01aa, 0x0000, 62, 56), # 158: p=0.009020 ( 256, 2) + (0x0034, 0x0000, 75, 71), # 159: p=0.001102 ( 1, 1208) + (0x0286, 0x0000, 58, 52), # 160: p=0.013672 ( 168, 2) + (0x00a0, 0x0000, 69, 163), # 161: p=0.003387 ( 1, 392) + (0x03d3, 0x0000, 54, 48), # 162: p=0.020710 ( 110, 2) + (0x0117, 0x0000, 65, 59), # 163: p=0.005912 ( 2, 392) + (0x05c5, 0x0000, 50, 42), # 164: p=0.031250 ( 72, 2) + (0x01ea, 0x0000, 167, 171), # 165: p=0.010362 ( 1, 127) + (0x08ad, 0x0000, 44, 38), # 166: p=0.046979 ( 47, 2) + (0x0144, 0x0000, 65, 169), # 167: p=0.006849 ( 1, 193) + (0x0ccc, 0x0000, 40, 32), # 168: p=0.069306 ( 31, 2) + (0x0234, 0x0000, 59, 53), # 169: p=0.011925 ( 2, 193) + (0x1302, 0x0000, 34, 26), # 170: p=0.102940 ( 20, 2) + (0x0353, 0x0000, 55, 47), # 171: p=0.017995 ( 2, 127) + (0x1b81, 0x0000, 30, 174), # 172: p=0.148935 ( 13, 2) + (0x05c5, 0x0000, 175, 193), # 173: p=0.031249 ( 1, 41) + (0x24ef, 0x0000, 24, 18), # 174: p=0.199999 ( 13, 3) + (0x03cf, 0x0000, 177, 191), # 175: p=0.020618 ( 1, 63) + (0x2b74, 0x0000, 178, 222), # 176: p=0.235291 ( 4, 1) + (0x0285, 0x0000, 179, 189), # 177: p=0.013652 ( 1, 96) + (0x201d, 0x0000, 180, 218), # 178: p=0.173910 ( 6, 1) + (0x01ab, 0x0000, 181, 187), # 179: p=0.009029 ( 1, 146) + (0x1715, 0x0000, 182, 216), # 180: p=0.124998 ( 9, 1) + (0x011a, 0x0000, 183, 185), # 181: p=0.005961 ( 1, 222) + (0x0fb7, 0x0000, 184, 214), # 182: p=0.085105 ( 14, 1) + (0x00ba, 0x0000, 69, 61), # 183: p=0.003925 ( 1, 338) + (0x0a67, 0x0000, 186, 212), # 184: p=0.056337 ( 22, 1) + (0x01eb, 0x0000, 59, 53), # 185: p=0.010386 ( 2, 
222) + (0x06e7, 0x0000, 188, 210), # 186: p=0.037382 ( 34, 1) + (0x02e6, 0x0000, 55, 49), # 187: p=0.015695 ( 2, 146) + (0x0496, 0x0000, 190, 208), # 188: p=0.024844 ( 52, 1) + (0x045e, 0x0000, 51, 45), # 189: p=0.023648 ( 2, 96) + (0x030d, 0x0000, 192, 206), # 190: p=0.016529 ( 79, 1) + (0x0690, 0x0000, 47, 39), # 191: p=0.035533 ( 2, 63) + (0x0206, 0x0000, 194, 204), # 192: p=0.010959 ( 120, 1) + (0x09de, 0x0000, 41, 195), # 193: p=0.053434 ( 2, 41) + (0x0155, 0x0000, 196, 202), # 194: p=0.007220 ( 183, 1) + (0x0dc8, 0x0000, 37, 31), # 195: p=0.074626 ( 3, 41) + (0x00e1, 0x0000, 198, 200), # 196: p=0.004750 ( 279, 1) + (0x2b74, 0x0000, 199, 243), # 197: p=0.235291 ( 1, 4) + (0x0094, 0x0000, 72, 64), # 198: p=0.003132 ( 424, 1) + (0x201d, 0x0000, 201, 239), # 199: p=0.173910 ( 1, 6) + (0x0188, 0x0000, 62, 56), # 200: p=0.008284 ( 279, 2) + (0x1715, 0x0000, 203, 237), # 201: p=0.124998 ( 1, 9) + (0x0252, 0x0000, 58, 52), # 202: p=0.012567 ( 183, 2) + (0x0fb7, 0x0000, 205, 235), # 203: p=0.085105 ( 1, 14) + (0x0383, 0x0000, 54, 48), # 204: p=0.019021 ( 120, 2) + (0x0a67, 0x0000, 207, 233), # 205: p=0.056337 ( 1, 22) + (0x0547, 0x0000, 50, 44), # 206: p=0.028571 ( 79, 2) + (0x06e7, 0x0000, 209, 231), # 207: p=0.037382 ( 1, 34) + (0x07e2, 0x0000, 46, 38), # 208: p=0.042682 ( 52, 2) + (0x0496, 0x0000, 211, 229), # 209: p=0.024844 ( 1, 52) + (0x0bc0, 0x0000, 40, 34), # 210: p=0.063636 ( 34, 2) + (0x030d, 0x0000, 213, 227), # 211: p=0.016529 ( 1, 79) + (0x1178, 0x0000, 36, 28), # 212: p=0.094593 ( 22, 2) + (0x0206, 0x0000, 215, 225), # 213: p=0.010959 ( 1, 120) + (0x19da, 0x0000, 30, 22), # 214: p=0.139999 ( 14, 2) + (0x0155, 0x0000, 217, 223), # 215: p=0.007220 ( 1, 183) + (0x24ef, 0x0000, 26, 16), # 216: p=0.199998 ( 9, 2) + (0x00e1, 0x0000, 219, 221), # 217: p=0.004750 ( 1, 279) + (0x320e, 0x0000, 20, 220), # 218: p=0.269229 ( 6, 2) + (0x0094, 0x0000, 71, 63), # 219: p=0.003132 ( 1, 424) + (0x432a, 0x0000, 14, 8), # 220: p=0.344827 ( 6, 3) + (0x0188, 0x0000, 61, 55), # 221: p=0.008284 ( 2, 279) + (0x447d, 0x0000, 14, 224), # 222: p=0.349998 ( 4, 2) + (0x0252, 0x0000, 57, 51), # 223: p=0.012567 ( 2, 183) + (0x5ece, 0x0000, 8, 2), # 224: p=0.434782 ( 4, 3) + (0x0383, 0x0000, 53, 47), # 225: p=0.019021 ( 2, 120) + (0x8000, 0x0000, 228, 87), # 226: p=0.500000 ( 1, 1) + (0x0547, 0x0000, 49, 43), # 227: p=0.028571 ( 2, 79) + (0x481a, 0x0000, 230, 246), # 228: p=0.363634 ( 2, 1) + (0x07e2, 0x0000, 45, 37), # 229: p=0.042682 ( 2, 52) + (0x3579, 0x0000, 232, 244), # 230: p=0.285711 ( 3, 1) + (0x0bc0, 0x0000, 39, 33), # 231: p=0.063636 ( 2, 34) + (0x24ef, 0x0000, 234, 238), # 232: p=0.199997 ( 5, 1) + (0x1178, 0x0000, 35, 27), # 233: p=0.094593 ( 2, 22) + (0x1978, 0x0000, 138, 236), # 234: p=0.137929 ( 8, 1) + (0x19da, 0x0000, 29, 21), # 235: p=0.139999 ( 2, 14) + (0x2865, 0x0000, 24, 16), # 236: p=0.218748 ( 8, 2) + (0x24ef, 0x0000, 25, 15), # 237: p=0.199998 ( 2, 9) + (0x3987, 0x0000, 240, 8), # 238: p=0.304346 ( 5, 2) + (0x320e, 0x0000, 19, 241), # 239: p=0.269229 ( 2, 6) + (0x2c99, 0x0000, 22, 242), # 240: p=0.241378 ( 7, 2) + (0x432a, 0x0000, 13, 7), # 241: p=0.344827 ( 3, 6) + (0x3b5f, 0x0000, 16, 10), # 242: p=0.312499 ( 7, 3) + (0x447d, 0x0000, 13, 245), # 243: p=0.349998 ( 2, 4) + (0x5695, 0x0000, 10, 2), # 244: p=0.411764 ( 3, 2) + (0x5ece, 0x0000, 7, 1), # 245: p=0.434782 ( 3, 4) + (0x8000, 0x0000, 244, 83), # 246: p=0.500000 ( 2, 2) + (0x8000, 0x0000, 249, 250), # 247: p=0.500000 ( 1, 1) + (0x5695, 0x0000, 10, 2), # 248: p=0.411764 ( 3, 2) + (0x481a, 0x0000, 89, 143), # 249: p=0.363634 ( 1, 
2) + (0x481a, 0x0000, 230, 246), # 250: p=0.363634 ( 2, 1) + (0, 0, 0, 0), + (0, 0, 0, 0), + (0, 0, 0, 0), + (0, 0, 0, 0), + (0, 0, 0, 0), +] + + +xmtf = ( + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, + 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF +) + +class BZZDecoder(): + def __init__(self, infile, outfile): + self.instream = infile + self.outf = outfile + self.ieof = False + self.bptr = None + self.xsize = None + self.outbuf = [0] * (MAXBLOCK * 1024) + self.byte = None + self.scount = 0 + self.delay = 25 + self.a = 0 + self.code = 0 + self.bufint = 0 + self.ctx = [0] * 300 + # table + self.p = [0] * 256 + self.m = [0] * 256 + self.up = [0] * 256 + self.dn = [0] * 256 + # machine independent ffz + self.ffzt = [0] * 256 + + # Create machine independent ffz table + for i in range(256): + j = i + while(j & 0x80): + self.ffzt[i] += 1 + j <<= 1 + # Initialize table + self.newtable(default_ztable) + # Codebit counter + # Read first 16 bits of code + if not self.read_byte(): + self.byte = 0xff + self.code = (self.byte << 8) + if not self.read_byte(): + self.byte = 0xff + self.code = self.code | self.byte + # Preload buffer + self.preload() + # Compute initial fence + self.fence = self.code + if self.code >= 0x8000: + self.fence = 0x7fff + + def convert(self, sz): + if self.ieof: + return 0 + copied = 0 + while sz > 0 and not (self.ieof): + # Decode if needed + if not self.xsize: + self.bptr = 0 + if not self.decode(): # input block size set in decode + self.xsize = 1 + self.ieof = True + self.xsize -= 1 + + # Compute remaining + bytes = self.xsize + if bytes > sz: + bytes = sz + # Transfer + if bytes: + for i in range(bytes): + self.outf.write(chr(self.outbuf[self.bptr + i])) + self.xsize -= bytes + self.bptr += bytes + sz -= bytes + copied += bytes + # offset += bytes; // for tell() + return copied + + def preload(self): + while self.scount <= 24: + if self.read_byte() < 1: + self.byte = 0xff + if --self.delay < 1: + raise BZZDecoderError("BiteStream EOF") + self.bufint = (self.bufint << 8) | self.byte + 
self.scount += 8 + + def newtable(self, table): + for i in range(256): + self.p[i] = table[i][0] + self.m[i] = table[i][1] + self.up[i] = table[i][2] + self.dn[i] = table[i][3] + + def decode(self): + outbuf = self.outbuf + # Decode block size + self.xsize = self.decode_raw(24) + if not self.xsize: + return 0 + if self.xsize > MAXBLOCK * 1024: # 4MB (4096 * 1024) is max block + raise BZZDecoderError("BiteStream.corrupt") + # Dec11ode Estimation Speed + fshift = 0 + if self.zpcodec_decoder(): + fshift += 1 + if self.zpcodec_decoder(): + fshift += 1 + # Prepare Quasi MTF + mtf = list(xmtf) # unsigned chars + freq = [0] * FREQMAX + fadd = 4 + # Decode + mtfno = 3 + markerpos = -1 + for i in range(self.xsize): + ctxid = CTXIDS - 1 + if ctxid > mtfno: + ctxid = mtfno + cx = self.ctx + if self.zpcodec_decode(cx, ctxid): + mtfno = 0 + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, ctxid + CTXIDS): + mtfno = 1 + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS): + mtfno = 2 + self.decode_binary(cx, 2*CTXIDS + 1, 1) + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS+2): + mtfno = 4 + self.decode_binary(cx, 2*CTXIDS+2 + 1, 2) + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS + 6): + mtfno = 8 + self.decode_binary(cx, 2*CTXIDS + 6 + 1, 3) + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS + 14): + mtfno = 16 + self.decode_binary(cx, 2*CTXIDS + 14 + 1, 4) + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS + 30 ): + mtfno = 32 + self.decode_binary(cx, 2*CTXIDS + 30 + 1, 5) + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS + 62 ): + mtfno = 64 + self.decode_binary(cx, 2*CTXIDS + 62 + 1, 6) + outbuf[i] = mtf[mtfno] + elif self.zpcodec_decode(cx, 2*CTXIDS + 126): + mtfno = 128 + self.decode_binary(cx, 2*CTXIDS + 126 + 1, 7) + outbuf[i] = mtf[mtfno] + else: + mtfno = 256 # EOB + outbuf[i] = 0 + markerpos = i + continue + + # Rotate mtf according to empirical frequencies (new!) 
+ # :rotate label + # Adjust frequencies for overflow + fadd = fadd + (fadd >> fshift) + if fadd > 0x10000000: + fadd >>= 24 + freq[0] >>= 24 + freq[1] >>= 24 + freq[2] >>= 24 + freq[3] >>= 24 + for k in range(4, FREQMAX): + freq[k] = freq[k] >> 24 + # Relocate new char according to new freq + fc = fadd + if mtfno < FREQMAX: + fc += freq[mtfno] + k = mtfno + while (k >= FREQMAX): + mtf[k] = mtf[k - 1] + k -= 1 + while (k > 0 and fc >= freq[k - 1]): + mtf[k] = mtf[k - 1] + freq[k] = freq[k - 1] + k -= 1 + mtf[k] = outbuf[i] + freq[k] = fc + #/////////////////////////////// + #//////// Reconstruct the string + + if markerpos < 1 or markerpos >= self.xsize: + raise BZZDecoderError("BiteStream.corrupt") + # Allocate pointers + posn = [0] * self.xsize + # Prepare count buffer + count = [0] * 256 + # Fill count buffer + for i in range(markerpos): + c = outbuf[i] + posn[i] = (c << 24) | (count[c] & 0xffffff) + count[c] += 1 + for i in range(markerpos + 1, self.xsize): + c = outbuf[i] + posn[i] = (c << 24) | (count[c] & 0xffffff) + count[c] += 1 + # Compute sorted char positions + last = 1 + for i in range(256): + tmp = count[i] + count[i] = last + last += tmp + # Undo the sort transform + i = 0 + last = self.xsize - 1 + while last > 0: + n = posn[i] + c = (posn[i] >> 24) + last -= 1 + outbuf[last] = c + i = count[c] + (n & 0xffffff) + # Free and check + if i != markerpos: + raise BZZDecoderError("BiteStream.corrupt") + return self.xsize + + def decode_raw(self, bits): + n = 1 + m = (1 << bits) + while n < m: + b = self.zpcodec_decoder() + n = (n << 1) | b + return n - m + + def decode_binary(self, ctx, index, bits): + n = 1 + m = (1 << bits) + while n < m: + b = self.zpcodec_decode(ctx, index + n - 1) + n = (n << 1) | b + return n - m + + def zpcodec_decoder(self): + return self.decode_sub_simple(0, 0x8000 + (self.a >> 1)) + + def decode_sub_simple(self, mps, z): + # Test MPS/LPS + if z > self.code: + # LPS branch + z = 0x10000 - z + self.a += +z + self.code = self.code + z + # LPS renormalization + shift = self.ffz() + self.scount -= shift + self.a = self.a << shift + self.a &= 0xffff + self.code = (self.code << shift) | ((self.bufint >> self.scount) & ((1 << shift) - 1)) + self.code &= 0xffff + if self.scount < 16: + self.preload() + # Adjust fence + self.fence = self.code + if self.code >= 0x8000: + self.fence = 0x7fff + result = mps ^ 1 + else: + # MPS renormalization + self.scount -= 1 + self.a = (z << 1) & 0xffff + self.code = ((self.code << 1) | ((self.bufint >> self.scount) & 1)) + self.code &= 0xffff + if self.scount < 16: + self.preload() + # Adjust fence + self.fence = self.code + if self.code >= 0x8000: + self.fence = 0x7fff + result = mps + return result + + def decode_sub(self, ctx, index, z): + # Save bit + bit = (ctx[index] & 1) + # Avoid interval reversion + d = 0x6000 + ((z + self.a) >> 2) + if z > d: + z = d + # Test MPS/LPS + if z > self.code: + # LPS branch + z = 0x10000 - z + self.a += +z + self.code = self.code + z + # LPS adaptation + ctx[index] = self.dn[ctx[index]] + # LPS renormalization + shift = self.ffz() + self.scount -= shift + self.a = (self.a << shift) & 0xffff + self.code = ((self.code << shift) | ((self.bufint >> self.scount) & ((1 << shift) - 1))) & 0xffff + if self.scount < 16: + self.preload() + # Adjust fence + self.fence = self.code + if self.code >= 0x8000: + self.fence = 0x7fff + return bit ^ 1 + else: + # MPS adaptation + if self.a >= self.m[ctx[index]]: + ctx[index] = self.up[ctx[index]] + # MPS renormalization + self.scount -= 1 + self.a = z << 1 & 
0xffff + self.code = ((self.code << 1) | ((self.bufint >> self.scount) & 1)) & 0xffff + if self.scount < 16: + self.preload() + # Adjust fence + self.fence = self.code + if self.code >= 0x8000: + self.fence = 0x7fff + return bit + + def zpcodec_decode(self, ctx, index): + z = self.a + self.p[ctx[index]] + if z <= self.fence: + self.a = z + res = (ctx[index] & 1) + else: + res = self.decode_sub(ctx, index, z) + return res + + def read_byte(self): + res = 0 + if self.instream: + ires = self.instream.read(1) + res = len(ires) + if res: + self.byte = ord(ires[0]) + else: + raise NotImplementedError + return res + + def ffz(self): + x = self.a + if (x >= 0xff00): + return (self.ffzt[x & 0xff] + 8) + else: + return (self.ffzt[(x >> 8) & 0xff]) + + + +### for testing + +def main(): + import sys + infile = file(sys.argv[1], "rb") + outfile = file(sys.argv[2], "wb") + dec = BZZDecoder(infile, outfile) + while True: + res = dec.convert(1024 * 1024) + if not res: + break + +if __name__ == "__main__": + main() diff --git a/src/calibre/ebooks/djvu/input.py b/src/calibre/ebooks/djvu/input.py new file mode 100644 index 0000000000..0bae302568 --- /dev/null +++ b/src/calibre/ebooks/djvu/input.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2011, Anthon van der Neut ' +__docformat__ = 'restructuredtext en' + +import os +from subprocess import Popen, PIPE, call +from cStringIO import StringIO + +from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.txt.processor import convert_basic + +class DJVUInput(InputFormatPlugin): + + name = 'DJVU Input' + author = 'Anthon van der Neut' + description = 'Convert OCR-ed DJVU files (.djvu) to HTML' + file_types = set(['djvu']) + output_encoding = None + + recommendations = set( + # [('page_breaks_before', '/', OptionRecommendation.MED)] + ) + + + def convert(self, stream, options, file_ext, log, accelerators): + stdout = StringIO() + ppdjvu = True + # using djvutxt is MUCH faster, should make it an option + if False and os.path.exists('/usr/bin/djvutxt'): + from calibre.ptempfile import PersistentTemporaryFile + try: + fp = PersistentTemporaryFile(suffix='.djvu', prefix='calibre') + filename = fp._name + fp.write(stream.read()) + fp.close() + cmd = ['djvutxt', filename] + stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0]) + os.remove(filename) + ppdjvu = False + except: + stream.seek(0) # retry with the pure python converter + if ppdjvu: + from .djvu import DJVUFile + x = DJVUFile(stream) + x.get_text(stdout) + + html = convert_basic(stdout.getvalue().replace("\n", ' ').replace('\037', '\n\n')) + # Run the HTMLized text through the html processing plugin. + from calibre.customize.ui import plugin_for_input_format + html_input = plugin_for_input_format('html') + for opt in html_input.options: + setattr(options, opt.option.name, opt.recommended_value) + options.input_encoding = 'utf-8' + base = os.getcwdu() + if file_ext != 'txtz' and hasattr(stream, 'name'): + base = os.path.dirname(stream.name) + fname = os.path.join(base, 'index.html') + c = 0 + while os.path.exists(fname): + c += 1 + fname = 'index%d.html'%c + htmlfile = open(fname, 'wb') + with htmlfile: + htmlfile.write(html.encode('utf-8')) + odi = options.debug_pipeline + options.debug_pipeline = None + # Generate oeb from html conversion. 
+        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
+                {})
+        options.debug_pipeline = odi
+        os.remove(htmlfile.name)
+
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+
+        return oeb
+

From bfade8c9110c2ca8b1141614b735c83ade1ea9e5 Mon Sep 17 00:00:00 2001
From: Anthon
Date: Tue, 4 Oct 2011 12:36:08 +0200
Subject: [PATCH 04/58] Split dump of text and structure
---
 src/calibre/ebooks/djvu/djvu.py | 44 +++++++++++++++++--------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/djvu/djvu.py b/src/calibre/ebooks/djvu/djvu.py
index 8a9282f83e..911d839e4d 100644
--- a/src/calibre/ebooks/djvu/djvu.py
+++ b/src/calibre/ebooks/djvu/djvu.py
@@ -57,13 +57,11 @@ class DjvuChunk(object):
         if verbose > 0:
             print ' end of chunk %d (%x)' % (pos, pos)

-    def dump(self, verbose=0, indent=1, out=None):
-        if out is None:
-            out = sys.stdout
-        if verbose > 0:
+    def dump(self, verbose=0, indent=1, out=None, txtout=None, maxlevel=100):
+        if out:
             out.write('  ' * indent)
             out.write('%s%s [%d]\n' % (self.type,
                 ':' + self.subtype if self.subtype else '', self.size))
-        if self.type == 'TXTz':
+        if txtout and self.type == 'TXTz':
            inbuf = StringIO(self.buf[self.datastart: self.dataend])
            outbuf = StringIO()
            decoder = BZZDecoder(inbuf, outbuf)
@@ -76,33 +74,38 @@ class DjvuChunk(object):
            for x in res[:3]:
                l <<= 8
                l += ord(x)
-           if verbose > 0:
+           if verbose > 0 and out:
                print >> out, l
-           out.write(res[3:3+l])
-           out.write('\n\f')
-        if self.type == 'TXTa':
+           txtout.write(res[3:3+l])
+           txtout.write('\n\f')
+        if txtout and self.type == 'TXTa':
            res = self.buf[self.datastart: self.dataend]
            l = 0
            for x in res[:3]:
                l <<= 8
                l += ord(x)
-           if verbose > 0:
+           if verbose > 0 and out:
                print >> out, l
-           out.write(res[3:3+l])
-           out.write('\n\f')
+           txtout.write(res[3:3+l])
+           txtout.write('\n\f')
+        if indent >= maxlevel:
+            return
        for schunk in self._subchunks:
-           schunk.dump(verbose=verbose, indent=indent+1, out=out)
+           schunk.dump(verbose=verbose, indent=indent+1, out=out, txtout=txtout)

 class DJVUFile(object):
-    def __init__(self, instream):
+    def __init__(self, instream, verbose=0):
         self.instream = instream
         buf = self.instream.read(4)
         assert(buf == 'AT&T')
         buf = self.instream.read()
-        self.dc = DjvuChunk(buf, 0, len(buf))
+        self.dc = DjvuChunk(buf, 0, len(buf), verbose=verbose)

     def get_text(self, outfile=None):
-        self.dc.dump(out=outfile)
+        self.dc.dump(txtout=outfile)
+
+    def dump(self, outfile=None, maxlevel=0):
+        self.dc.dump(out=outfile, maxlevel=maxlevel)

 def main():
     from ruamel.util.program import Program, CountAction
@@ -117,13 +120,18 @@ def main():
         #self._argparser.add_argument('--segments', '-s', action='append', nargs='+')
         #self._argparser.add_argument('--force', '-f', action='store_true')
         #self._argparser.add_argument('classname')
+        self._argparser.add_argument('--text', '-t', action='store_true')
+        self._argparser.add_argument('--dump', type=int, default=0)
         self._argparser.add_argument('file', nargs='+')

     def run(self):
         if self._args.verbose > 1: # can be negative with --quiet
             print self._args.file
-        x = DJVUFile(file(self._args.file[0], 'rb'))
-        x.get_text()
+        x = DJVUFile(file(self._args.file[0], 'rb'), verbose=self._args.verbose)
+        if self._args.text:
+            print x.get_text(sys.stdout)
+        if self._args.dump:
+            x.dump(sys.stdout, maxlevel=self._args.dump)
         return 0
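# A quick sketch of driving the split API introduced by this patch (the file
# name is a placeholder, and the two open() calls are needed because DJVUFile
# consumes its input stream): get_text() emits only the OCR text layer, while
# dump() walks the chunk tree down to the requested depth.
import sys
from calibre.ebooks.djvu.djvu import DJVUFile

with open('scanned.djvu', 'rb') as f:          # 'scanned.djvu' is illustrative
    DJVUFile(f).get_text(sys.stdout)           # text layer only (TXTz/TXTa chunks)
with open('scanned.djvu', 'rb') as f:
    DJVUFile(f).dump(sys.stdout, maxlevel=2)   # chunk structure, two levels deep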
tt = DJVUDecoder() From 8c5195cbcc4c5e20ae31664975e233cec29a24a8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 4 Oct 2011 10:07:26 -0600 Subject: [PATCH 05/58] Fix #865452 (Houston Chronicle news misses last section's stories) --- recipes/houston_chronicle.recipe | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe index 8d231dac16..b8171467ec 100644 --- a/recipes/houston_chronicle.recipe +++ b/recipes/houston_chronicle.recipe @@ -18,6 +18,7 @@ class HoustonChronicle(BasicNewsRecipe): keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or 'hst-articletext' in x or 'hst-galleryitem' in x)} + remove_attributes = ['xmlns'] feeds = [ ('News', "http://www.chron.com/rss/feed/News-270.php"), From ef0036ce4fd8f7774237df56c52547629839ab58 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 4 Oct 2011 10:51:12 -0600 Subject: [PATCH 06/58] Fix FAZ.net --- recipes/faznet.recipe | 51 +++++++++++-------------------------------- 1 file changed, 13 insertions(+), 38 deletions(-) diff --git a/recipes/faznet.recipe b/recipes/faznet.recipe index 742c6f4431..720993cfbb 100644 --- a/recipes/faznet.recipe +++ b/recipes/faznet.recipe @@ -19,45 +19,20 @@ class FazNet(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' remove_javascript = True - - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - - keep_only_tags = [dict(name='div', attrs={'class':'Article'})] - - remove_tags = [ - dict(name=['object','link','embed','base']) - ,dict(name='div', - attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo', - 'ArtikelServices', 'ModulLesermeinungenFooter', - 'ModulArtikelServices', 'BoxTool Aufklappen_Grau', - 'SocialMediaUnten', ]}), - dict(id=['KurzLinkMenu', 'ArtikelServicesMenu']), - ] + auto_cleanup = True + auto_cleanup_keep_tags = '//div[@class="FAZArtikelEinleitung"]' feeds = [ - ('FAZ.NET Aktuell', 'http://www.faz.net/s/RubF3CE08B362D244869BE7984590CB6AC1/Tpl~Epartner~SRss_.xml'), - ('Politik', 'http://www.faz.net/s/RubA24ECD630CAE40E483841DB7D16F4211/Tpl~Epartner~SRss_.xml'), - ('Wirtschaft', 'http://www.faz.net/s/RubC9401175958F4DE28E143E68888825F6/Tpl~Epartner~SRss_.xml'), - ('Feuilleton', 'http://www.faz.net/s/RubCC21B04EE95145B3AC877C874FB1B611/Tpl~Epartner~SRss_.xml'), - ('Sport', 'http://www.faz.net/s/Rub9F27A221597D4C39A82856B0FE79F051/Tpl~Epartner~SRss_.xml'), - ('Gesellschaft', 'http://www.faz.net/s/Rub02DBAA63F9EB43CEB421272A670A685C/Tpl~Epartner~SRss_.xml'), - ('Finanzen', 'http://www.faz.net/s/Rub4B891837ECD14082816D9E088A2D7CB4/Tpl~Epartner~SRss_.xml'), - ('Wissen', 'http://www.faz.net/s/Rub7F4BEE0E0C39429A8565089709B70C44/Tpl~Epartner~SRss_.xml'), - ('Reise', 'http://www.faz.net/s/RubE2FB5CA667054BDEA70FB3BC45F8D91C/Tpl~Epartner~SRss_.xml'), - ('Technik & Motor', 'http://www.faz.net/s/Rub01E4D53776494844A85FDF23F5707AD8/Tpl~Epartner~SRss_.xml'), - ('Beruf & Chance', 'http://www.faz.net/s/RubB1E10A8367E8446897468EDAA6EA0504/Tpl~Epartner~SRss_.xml') + ('FAZ.NET Aktuell', 'http://www.faz.net/aktuell/?rssview=1'), + ('Politik', 'http://www.faz.net/aktuell/politik/?rssview=1'), + ('Wirtschaft', 'http://www.faz.net/aktuell/wirtschaft/?rssview=1'), + ('Feuilleton', 'http://www.faz.net/aktuell/feuilleton/?rssview=1'), + ('Sport', 'http://www.faz.net/aktuell/sport/?rssview=1'), + ('Gesellschaft', 
'http://www.faz.net/aktuell/gesellschaft/?rssview=1'), + ('Finanzen', 'http://www.faz.net/aktuell/finanzen/?rssview=1'), + ('Technik & Motor', 'http://www.faz.net/aktuell/technik-motor/?rssview=1'), + ('Wissen', 'http://www.faz.net/aktuell/wissen/?rssview=1'), + ('Reise', 'http://www.faz.net/aktuell/reise/?rssview=1'), + ('Beruf & Chance', 'http://www.faz.net/aktuell/beruf-chance/?rssview=1') ] - def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) - del soup.body['onload'] - for item in soup.findAll(style=True): - del item['style'] - return soup From 0b3488319ef9c5dbbd2a57728425a9d13082d653 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 4 Oct 2011 10:59:37 -0600 Subject: [PATCH 07/58] ... --- recipes/faznet.recipe | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/faznet.recipe b/recipes/faznet.recipe index 720993cfbb..4f65c755c0 100644 --- a/recipes/faznet.recipe +++ b/recipes/faznet.recipe @@ -19,8 +19,8 @@ class FazNet(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' remove_javascript = True - auto_cleanup = True - auto_cleanup_keep_tags = '//div[@class="FAZArtikelEinleitung"]' + keep_only_tags = [{'class':'FAZArtikelEinleitung'}, + {'id':'ArtikelTabContent_0'}] feeds = [ ('FAZ.NET Aktuell', 'http://www.faz.net/aktuell/?rssview=1'), From cebaee7f2373206d2d261babbc3082e64f358891 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 4 Oct 2011 11:04:08 -0600 Subject: [PATCH 08/58] ... --- src/calibre/devices/interface.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index 507a836e37..4877cd359e 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -414,7 +414,8 @@ class DevicePlugin(Plugin): @classmethod def config_widget(cls): ''' - Should return a QWidget. The QWidget contains the settings for the device interface + Should return a QWidget. The QWidget contains the settings for the + device interface ''' raise NotImplementedError() @@ -429,8 +430,9 @@ class DevicePlugin(Plugin): @classmethod def settings(cls): ''' - Should return an opts object. The opts object should have at least one attribute - `format_map` which is an ordered list of formats for the device. + Should return an opts object. The opts object should have at least one + attribute `format_map` which is an ordered list of formats for the + device. 
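
        For example, an opts object satisfying this contract might be built
        like this (an illustrative sketch, not code from this patch)::

            class Opts(object):
                format_map = ['epub', 'lrf', 'pdf']

            @classmethod
            def settings(cls):
                return Opts()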
        '''
        raise NotImplementedError()

From 1e9a8643eb6d28046a24f553c27f9b482414c27e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 4 Oct 2011 11:21:32 -0600
Subject: [PATCH 09/58] Fix #855055 ("Back" button on download metadata dialog box)
---
 src/calibre/gui2/metadata/single_download.py | 37 +++++++++++++++++---
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/metadata/single_download.py b/src/calibre/gui2/metadata/single_download.py
index 3ede8f4b61..255abe3999 100644
--- a/src/calibre/gui2/metadata/single_download.py
+++ b/src/calibre/gui2/metadata/single_download.py
@@ -538,14 +538,20 @@ class CoversModel(QAbstractListModel): # {{{
         current_cover = QPixmap(I('default_cover.png'))
         self.blank = QPixmap(I('blank.png')).scaled(150, 200)

+        self.cc = current_cover
+        self.reset_covers(do_reset=False)

-        self.covers = [self.get_item(_('Current cover'), current_cover)]
+    def reset_covers(self, do_reset=True):
+        self.covers = [self.get_item(_('Current cover'), self.cc)]
         self.plugin_map = {}
         for i, plugin in enumerate(metadata_plugins(['cover'])):
             self.covers.append((plugin.name+'\n'+_('Searching...'),
                 QVariant(self.blank), None, True))
             self.plugin_map[plugin] = i+1

+        if do_reset:
+            self.reset()
+
     def get_item(self, src, pmap, waiting=False):
         sz = '%dx%d'%(pmap.width(), pmap.height())
         text = QVariant(src + '\n' + sz)
@@ -654,6 +660,9 @@ class CoversView(QListView): # {{{
         self.select(0)
         self.delegate.start_animation()

+    def reset_covers(self):
+        self.m.reset_covers()
+
     def clear_failed(self):
         plugin = self.m.plugin_for_index(self.currentIndex())
         self.m.clear_failed()
@@ -683,12 +692,18 @@ class CoversWidget(QWidget): # {{{
         l.addWidget(self.covers_view, 1, 0)
         self.continue_processing = True

+    def reset_covers(self):
+        self.covers_view.reset_covers()
+
     def start(self, book, current_cover, title, authors):
+        self.continue_processing = True
+        self.abort.clear()
         self.book, self.current_cover = book, current_cover
         self.title, self.authors = title, authors
         self.log('Starting cover download for:', book.title)
         self.log('Query:', title, authors, self.book.identifiers)
-        self.msg.setText('<p>'+_('Downloading covers for %s, please wait...')%book.title)
+        self.msg.setText('<p>'+
+            _('Downloading covers for %s, please wait...')%book.title)
         self.covers_view.start()

         self.worker = CoverWorker(self.log, self.abort, self.title,
@@ -726,8 +741,9 @@ class CoversWidget(QWidget): # {{{
         if num < 2:
             txt = _('Could not find any covers for %s')%self.book.title
         else:
-            txt = _('Found %(num)d covers of %(title)s. Pick the one you like'
-                    ' best.')%dict(num=num-1, title=self.title)
+            txt = _('Found %(num)d covers of %(title)s. '
+                    'Pick the one you like best.')%dict(num=num-1,
+                            title=self.title)
         self.msg.setText(txt)

         self.finished.emit()
@@ -832,10 +848,14 @@ class FullFetch(QDialog): # {{{
         self.next_button.clicked.connect(self.next_clicked)
         self.ok_button = self.bb.button(self.bb.Ok)
         self.ok_button.clicked.connect(self.ok_clicked)
+        self.prev_button = self.bb.addButton(_('Back'), self.bb.ActionRole)
+        self.prev_button.setIcon(QIcon(I('back.png')))
+        self.prev_button.clicked.connect(self.back_clicked)
         self.log_button = self.bb.addButton(_('View log'), self.bb.ActionRole)
         self.log_button.clicked.connect(self.view_log)
         self.log_button.setIcon(QIcon(I('debug.png')))
         self.ok_button.setVisible(False)
+        self.prev_button.setVisible(False)

         self.identify_widget = IdentifyWidget(self.log, self)
         self.identify_widget.rejected.connect(self.reject)
@@ -857,12 +877,21 @@ class FullFetch(QDialog): # {{{
     def book_selected(self, book):
         self.next_button.setVisible(False)
         self.ok_button.setVisible(True)
+        self.prev_button.setVisible(True)
         self.book = book
         self.stack.setCurrentIndex(1)
         self.log('\n\n')
         self.covers_widget.start(book, self.current_cover,
                 self.title, self.authors)

+    def back_clicked(self):
+        self.next_button.setVisible(True)
+        self.ok_button.setVisible(False)
+        self.prev_button.setVisible(False)
+        self.stack.setCurrentIndex(0)
+        self.covers_widget.cancel()
+        self.covers_widget.reset_covers()
+
     def accept(self):
         # Prevent the usual dialog accept mechanisms from working
         pass

From b8cff2585be53c1be0022c338b20aeed76625d16 Mon Sep 17 00:00:00 2001
From: Anthon
Date: Tue, 4 Oct 2011 23:53:15 +0200
Subject: [PATCH 10/58] Add a UI option to suppress searching for the djvutxt
 utility (if it is not available, the pure Python text extractor is used as a
 fallback anyway)
---
 src/calibre/ebooks/djvu/input.py       | 16 ++++++++++-----
 src/calibre/gui2/convert/__init__.py   |  1 +
 src/calibre/gui2/convert/djvu_input.py | 22 ++++++++++++++++++++
 src/calibre/gui2/convert/djvu_input.ui | 28 ++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 5 deletions(-)
 create mode 100644 src/calibre/gui2/convert/djvu_input.py
 create mode 100644 src/calibre/gui2/convert/djvu_input.ui

diff --git a/src/calibre/ebooks/djvu/input.py b/src/calibre/ebooks/djvu/input.py
index 0bae302568..6f2f57f377 100644
--- a/src/calibre/ebooks/djvu/input.py
+++ b/src/calibre/ebooks/djvu/input.py
@@ -18,18 +18,24 @@ class DJVUInput(InputFormatPlugin):
     author = 'Anthon van der Neut'
     description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
     file_types = set(['djvu'])
-    output_encoding = None
+    #output_encoding = None

-    recommendations = set(
-        # [('page_breaks_before', '/', OptionRecommendation.MED)]
-        )
+    options = set([
+        OptionRecommendation(name='use_djvutxt', recommended_value=True,
+            help=_('Try to use the djvutxt program, falling back to the pure Python implementation if it fails or is not available')),
+    ])
+
+    #recommendations = set(
+    #    [('try djvutxt', '/', OptionRecommendation.MED)]
+    #)

     def convert(self, stream, options, file_ext, log, accelerators):
         stdout = StringIO()
         ppdjvu = True
+        log.debug('options: %d' % 
(options.use_djvutxt)) # using djvutxt is MUCH faster, should make it an option - if False and os.path.exists('/usr/bin/djvutxt'): + if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'): from calibre.ptempfile import PersistentTemporaryFile try: fp = PersistentTemporaryFile(suffix='.djvu', prefix='calibre') diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index bdcf9ede05..98ddd37250 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -56,6 +56,7 @@ class Widget(QWidget): self._icon = QIcon(self.ICON) for name in self._options: if not hasattr(self, 'opt_'+name): + print dir(self) raise Exception('Option %s missing in %s'%(name, self.__class__.__name__)) self.connect_gui_obj(getattr(self, 'opt_'+name)) diff --git a/src/calibre/gui2/convert/djvu_input.py b/src/calibre/gui2/convert/djvu_input.py new file mode 100644 index 0000000000..c5e148e894 --- /dev/null +++ b/src/calibre/gui2/convert/djvu_input.py @@ -0,0 +1,22 @@ +# coding: utf-8 + +__license__ = 'GPL v3' +__copyright__ = '2011, Anthon van der Neut ' + + +from calibre.gui2.convert.djvu_input_ui import Ui_Form +from calibre.gui2.convert import Widget, QDoubleSpinBox + +class PluginWidget(Widget, Ui_Form): + + TITLE = _('DJVU Input') + HELP = _('Options specific to')+' DJVU '+_('input') + COMMIT_NAME = 'djvu_input' + ICON = I('mimetypes/djvu.png') + + def __init__(self, parent, get_option, get_help, db=None, book_id=None): + Widget.__init__(self, parent, + ['use_djvutxt', ]) + self.db, self.book_id = db, book_id + self.initialize_options(get_option, get_help, db, book_id) + diff --git a/src/calibre/gui2/convert/djvu_input.ui b/src/calibre/gui2/convert/djvu_input.ui new file mode 100644 index 0000000000..94d5851d4f --- /dev/null +++ b/src/calibre/gui2/convert/djvu_input.ui @@ -0,0 +1,28 @@ + + + Form + + + + 0 + 0 + 400 + 300 + + + + Form + + + + + + Use &djvutxt + + + + + + + + From 2389b6fd7858028cdaacaec53e795a57f9375d03 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Wed, 5 Oct 2011 06:48:38 +0200 Subject: [PATCH 11/58] Fix plugboard problem where customizations to formats accepted by a device were ignored. 
--- src/calibre/gui2/preferences/plugboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py index cc94b5e42a..587db0fd79 100644 --- a/src/calibre/gui2/preferences/plugboard.py +++ b/src/calibre/gui2/preferences/plugboard.py @@ -58,7 +58,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.device_to_formats_map = {} for device in device_plugins(): n = device_name_for_plugboards(device) - self.device_to_formats_map[n] = set(device.FORMATS) + self.device_to_formats_map[n] = set(device.settings().format_map) if getattr(device, 'CAN_DO_DEVICE_DB_PLUGBOARD', False): self.device_to_formats_map[n].add('device_db') if n not in self.devices: From d1930a1ea0da117eab544c88cc67f41ba96b65ab Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 5 Oct 2011 09:58:56 -0600 Subject: [PATCH 12/58] Handle https proxies specified with a http:// URL --- src/calibre/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 358a7ee4bf..7790e91bea 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -278,6 +278,8 @@ def get_proxies(debug=True): continue if proxy.startswith(key+'://'): proxy = proxy[len(key)+3:] + if key == 'https' and proxy.startswith('http://'): + proxy = proxy[7:] if proxy.endswith('/'): proxy = proxy[:-1] if len(proxy) > 4: From c7854de342dd2b9a1605f36d07616814feacb7b0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 5 Oct 2011 09:59:51 -0600 Subject: [PATCH 13/58] Update heise online --- recipes/heise_online.recipe | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/recipes/heise_online.recipe b/recipes/heise_online.recipe index f83ff8126b..338b54782c 100644 --- a/recipes/heise_online.recipe +++ b/recipes/heise_online.recipe @@ -1,7 +1,9 @@ from calibre.web.feeds.news import BasicNewsRecipe +import re + class AdvancedUserRecipe(BasicNewsRecipe): - title = 'Heise-online' + title = 'heise online' description = 'News vom Heise-Verlag' __author__ = 'schuster' use_embedded_content = False @@ -12,10 +14,11 @@ class AdvancedUserRecipe(BasicNewsRecipe): remove_empty_feeds = True timeout = 5 no_stylesheets = True + encoding = 'utf-8' remove_tags_after = dict(name ='p', attrs={'class':'editor'}) - remove_tags = [dict(id='navi_top_container'), + remove_tags = [{'class':'navi_top_container'}, dict(id='navi_bottom'), dict(id='mitte_rechts'), dict(id='navigation'), @@ -25,28 +28,28 @@ class AdvancedUserRecipe(BasicNewsRecipe): dict(id='content_foren'), dict(id='seiten_navi'), dict(id='adbottom'), - dict(id='sitemap')] + dict(id='sitemap'), + dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')), + ] feeds = [ ('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'), - ('Auto', 'http://www.heise.de/autos/rss/news.rdf'), - ('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'), - ('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'), - ('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'), - ('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'), - ('Open ', 'http://www.heise.de/open/news/news-atom.xml'), - ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'), - ('Security ', 'http://www.heise.de/security/news/news-atom.xml'), - ('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'), ('iX', 'http://www.heise.de/ix/news/news.rdf'), - ('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'), + ('Technology Review', 
'http://www.heise.de/tr/news-atom.xml'), + ('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'), + ('Security', 'http://www.heise.de/security/news/news-atom.xml'), + ('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'), + ('Open Source', 'http://www.heise.de/open/news/news-atom.xml'), + ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'), + ('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'), + ('Autos', 'http://www.heise.de/autos/rss/news.rdf'), + ('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'), ('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'), ('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'), ('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'), - ('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'), + ('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'), ('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf') -] + ] def print_version(self, url): return url + '?view=print' - From ec9414433b9b14a68e6005d2db65a916e65d4924 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 5 Oct 2011 11:26:59 -0600 Subject: [PATCH 14/58] SONY-T1 scan all dirs in main memory for ebooks --- src/calibre/devices/prs505/driver.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index 097f42f23e..cbd71d0823 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -329,4 +329,10 @@ class PRST1(USBMS): return __appname__ return self.EBOOK_DIR_CARD_A + def get_main_ebook_dir(self, for_upload=False): + if for_upload: + return __appname__ + return '' + + From 90e250c0329fb9a498b8dd25431c4dec4e0782e0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 5 Oct 2011 16:25:41 -0600 Subject: [PATCH 15/58] ... 
--- src/calibre/devices/prs505/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
index cbd71d0823..df436bfd9f 100644
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@@ -319,7 +319,7 @@ class PRST1(USBMS):
     THUMBNAIL_HEIGHT = 217
     SCAN_FROM_ROOT = True
     EBOOK_DIR_MAIN = __appname__
-
+    SUPPORTS_SUB_DIRS = True

     def windows_filter_pnp_id(self, pnp_id):
         return '_LAUNCHER' in pnp_id or '_SETTING' in pnp_id

From b13a2a08a3b406b08870b68d2d96cd2daba31884 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 8 Oct 2011 07:28:27 +0530
Subject: [PATCH 16/58] Improve Metro UK
---
 recipes/metro_uk.recipe | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe
index 287af47f5c..647e5633e7 100644
--- a/recipes/metro_uk.recipe
+++ b/recipes/metro_uk.recipe
@@ -5,30 +5,46 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    description = 'News as provided by The Metro -UK'

    __author__ = 'Dave Asbury'
+   cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
+
    no_stylesheets = True
    oldest_article = 1
-   max_articles_per_feed = 25
+   max_articles_per_feed = 20
    remove_empty_feeds = True
    remove_javascript = True

-   preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
+   #preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
+   preprocess_regexps = [
+       (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: '

')]
+   preprocess_regexps = [
+       (re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
    language = 'en_GB'
    masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'

-   extra_css = 'h2 {font: sans-serif medium;}'
+
    keep_only_tags = [
       dict(name='h1'), dict(name='h2', attrs={'class':'h2'}),
       dict(attrs={'class':['img-cnt figure']}),
       dict(attrs={'class':['art-img']}),
-
-      dict(name='div', attrs={'class':'art-lft'})
+      dict(name='div', attrs={'class':'art-lft'}),
+      dict(name='p')
    ]
    remove_tags = [dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
       'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r' ]}),
-      dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
-   ]
+      dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
+      ,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
+   ]
    feeds = [ (u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
+
+   extra_css = '''
+      body {font: sans-serif medium;}
+      h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
+      h2 {text-align : center; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
+      span { font-size:9.5px; font-weight:bold; font-style:italic}
+      p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+
+   '''

From 211786644fb29e37c68c6887a7152802f7949d88 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 8 Oct 2011 07:56:03 +0530
Subject: [PATCH 17/58] Revista Piaui by Eduardo Simoes
---
 recipes/revista_piaui.recipe | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 recipes/revista_piaui.recipe

diff --git a/recipes/revista_piaui.recipe b/recipes/revista_piaui.recipe
new file mode 100644
index 0000000000..22f0c24a02
--- /dev/null
+++ b/recipes/revista_piaui.recipe
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class RevistaPiaui(BasicNewsRecipe):
+    title = u'Revista piau\xed'
+    language = 'pt_BR'
+    __author__ = u'Eduardo Gustini Simões'
+    oldest_article = 31
+    max_articles_per_feed = 50
+    auto_cleanup = True
+
+    feeds = [(u'Edi\xe7\xe3o Atual', u'http://revistapiaui.estadao.com.br/feed/rss/edicao-atual.xml')]
+
+    def parse_feeds(self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                soup = self.index_to_soup('http://revistapiaui.estadao.com.br/feed/rss/edicao-atual.xml')
+                itemTitle = article.title.partition('|')[0].rstrip()
+                item = soup.find(text=itemTitle)
+ articleDescription = item.parent.parent.description.string.partition('
')[2] + article.summary = articleDescription + + return feeds + + def populate_article_metadata(self, article, soup, first): + h2 = soup.find('h2') + h2.string.replaceWith(h2.string.partition('|')[0].rstrip()) + h2.replaceWith(h2.prettify() + '

' + article.summary + '

' + ' posted at ' + article.localtime.strftime('%d-%m-%Y') + '

') From 0d41c10f4d97902db336147a08b9f27eb7ab19f5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 8 Oct 2011 08:00:19 +0530 Subject: [PATCH 18/58] Update Ming Pao --- recipes/ming_pao.recipe | 252 ++++++++++++++++++++++++++-------------- 1 file changed, 163 insertions(+), 89 deletions(-) diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index 9408d6c7d0..856d7166ff 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -6,19 +6,24 @@ __Region__ = 'Hong Kong' # Users of Kindle 3 with limited system-level CJK support # please replace the following "True" with "False". __MakePeriodical__ = True -# Turn below to true if your device supports display of CJK titles +# Turn below to True if your device supports display of CJK titles __UseChineseTitle__ = False # Set it to False if you want to skip images __KeepImages__ = True -# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source +# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source __UseLife__ = True -# (HK only) if __UseLife__ is true, turn this on if you want to include the column section +# (HK only) It is to disable the column section which is now a premium content __InclCols__ = False +# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats +__ParsePFF__ = False +# (HK only) Turn below to True if you wish hi-res images +__HiResImg__ = False ''' Change Log: -2011/09/21: fetching "column" section is made optional. Default is False +2011/10/04: option to get hi-res photos for the articles +2011/09/21: fetching "column" section is made optional. 2011/09/18: parse "column" section stuff from source text file directly. 2011/09/07: disable "column" section as it is no longer offered free. 
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source @@ -42,7 +47,7 @@ Change Log: 2010/10/31: skip repeated articles in section pages ''' -import os, datetime, re +import os, datetime, re, mechanize from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested from calibre.ebooks.BeautifulSoup import BeautifulSoup @@ -56,7 +61,7 @@ class MPRecipe(BasicNewsRecipe): title = 'Ming Pao - Hong Kong' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' category = 'Chinese, News, Hong Kong' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title @@ -147,43 +152,6 @@ class MPRecipe(BasicNewsRecipe): conversion_options = {'linearize_tables':True} timefmt = '' - def image_url_processor(cls, baseurl, url): - # trick: break the url at the first occurance of digit, add an additional - # '_' at the front - # not working, may need to move this to preprocess_html() method -# minIdx = 10000 -# i0 = url.find('0') -# if i0 >= 0 and i0 < minIdx: -# minIdx = i0 -# i1 = url.find('1') -# if i1 >= 0 and i1 < minIdx: -# minIdx = i1 -# i2 = url.find('2') -# if i2 >= 0 and i2 < minIdx: -# minIdx = i2 -# i3 = url.find('3') -# if i3 >= 0 and i0 < minIdx: -# minIdx = i3 -# i4 = url.find('4') -# if i4 >= 0 and i4 < minIdx: -# minIdx = i4 -# i5 = url.find('5') -# if i5 >= 0 and i5 < minIdx: -# minIdx = i5 -# i6 = url.find('6') -# if i6 >= 0 and i6 < minIdx: -# minIdx = i6 -# i7 = url.find('7') -# if i7 >= 0 and i7 < minIdx: -# minIdx = i7 -# i8 = url.find('8') -# if i8 >= 0 and i8 < minIdx: -# minIdx = i8 -# i9 = url.find('9') -# if i9 >= 0 and i9 < minIdx: -# minIdx = i9 - return url - def get_dtlocal(self): dt_utc = datetime.datetime.utcnow() if __Region__ == 'Hong Kong': @@ -260,15 +228,16 @@ class MPRecipe(BasicNewsRecipe): else: for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), - (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]: + (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), + (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]: articles = self.parse_section(url) if articles: feeds.append((title, articles)) # special- editorial - ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') - if ed_articles: - feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) + #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr') + #if ed_articles: + # feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles)) for title, url in [(u'\u8ad6\u58c7 Forum', 
'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), @@ -279,20 +248,39 @@ class MPRecipe(BasicNewsRecipe): # special - finance #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') - fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') - if fin_articles: - feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) + #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea') + #if fin_articles: + # feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) - for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), - (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: - articles = self.parse_section(url) + for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]: + articles = self.parse_section2(url, keystr) if articles: feeds.append((title, articles)) + #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), + # (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]: + # articles = self.parse_section(url) + # if articles: + # feeds.append((title, articles)) + # special - entertainment - ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') - if ent_articles: - feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) + #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') + #if ent_articles: + # feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) + + for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal') + ]: + articles = self.parse_section2(url, keystr) + if articles: + feeds.append((title, articles)) + + if __InclCols__ == True: + # parse column section articles directly from .txt files + for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl') + ]: + articles = self.parse_section2_txt(url, keystr) + if articles: + feeds.append((title, articles)) for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: @@ -300,11 +288,6 @@ class MPRecipe(BasicNewsRecipe): if articles: feeds.append((title, articles)) - - # special- columns - col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn') - if col_articles: - feeds.append((u'\u5c08\u6b04 Columns', col_articles)) elif __Region__ == 'Vancouver': for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'), (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'), @@ -348,6 +331,16 @@ class MPRecipe(BasicNewsRecipe): title = self.tag_to_string(a) url = a.get('href', False) url = 'http://news.mingpao.com/' + dateStr + '/' +url + # replace the url to the print-friendly version + if __ParsePFF__ == True: + if url.rfind('Redirect') <> -1: + url = re.sub(dateStr + '.*' + dateStr, dateStr, url) + url = re.sub('%2F.*%2F', '/', url) + title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '') + url = 
url.replace('%2Etxt', '_print.htm') + url = url.replace('%5F', '_') + else: + url = url.replace('.htm', '_print.htm') if url not in included_urls and url.rfind('Redirect') == -1: current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) included_urls.append(url) @@ -472,38 +465,119 @@ class MPRecipe(BasicNewsRecipe): current_articles.reverse() return current_articles - # preprocess those .txt based files + # preprocess those .txt and javascript based files def preprocess_raw_html(self, raw_html, url): - if url.rfind('ftp') == -1: + #raw_html = raw_html.replace(u'

\u3010', u'\u3010') + if __HiResImg__ == True: + # TODO: add a _ in front of an image url + if url.rfind('news.mingpao.com') > -1: + imglist = re.findall('src="?.*?jpg"', raw_html) + br = mechanize.Browser() + br.set_handle_redirect(False) + for img in imglist: + gifimg = img.replace('jpg"', 'gif"') + try: + br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) + raw_html = raw_html.replace(img, gifimg) + except: + # find the location of the first _ + pos = img.find('_') + if pos > -1: + # if found, insert _ after the first _ + newimg = img[0:pos] + '_' + img[pos:] + raw_html = raw_html.replace(img, newimg) + else: + # if not found, insert _ after " + raw_html = raw_html.replace(img[1:], '"_' + img[1:]) + elif url.rfind('life.mingpao.com') > -1: + imglist = re.findall('src=\'?.*?jpg\'', raw_html) + br = mechanize.Browser() + br.set_handle_redirect(False) + #print 'Img list: ', imglist, '\n' + for img in imglist: + gifimg = img.replace('jpg\'', 'gif\'') + try: + #print 'Original: ', url + #print 'To append: ', "/../" + gifimg[5:len(gifimg)-1] + gifurl = re.sub(r'dailynews.*txt', '', url) + #print 'newurl: ', gifurl + gifimg[5:len(gifimg)-1] + br.open_novisit(gifurl + gifimg[5:len(gifimg)-1]) + #print 'URL: ', url + "/../" + gifimg[5:len(gifimg)-1] + #br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1]) + raw_html = raw_html.replace(img, gifimg) + except: + #print 'GIF not found' + pos = img.rfind('/') + newimg = img[0:pos+1] + '_' + img[pos+1:] + #print 'newimg: ', newimg + raw_html = raw_html.replace(img, newimg) + if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1: return raw_html else: - splitter = re.compile(r'\n') # Match non-digits - new_raw_html = 'Untitled

' - next_is_img_txt = False - title_started = False - met_article_start_char = False - for item in splitter.split(raw_html): - if item.startswith(u'\u3010'): - met_article_start_char = True - new_raw_html = new_raw_html + '

' + item + '

\n' - else: - if next_is_img_txt == False: - if item.startswith('='): - next_is_img_txt = True - new_raw_html += '

\n' + if url.rfind('_print.htm') <> -1: + # javascript based file + splitter = re.compile(r'\n') + new_raw_html = 'Untitled' + new_raw_html = new_raw_html + '' + for item in splitter.split(raw_html): + if item.startswith('var heading1 ='): + heading = item.replace('var heading1 = \'', '') + heading = heading.replace('\'', '') + heading = heading.replace(';', '') + new_raw_html = new_raw_html + '

' + heading + if item.startswith('var heading2 ='): + heading = item.replace('var heading2 = \'', '') + heading = heading.replace('\'', '') + heading = heading.replace(';', '') + if heading <> '': + new_raw_html = new_raw_html + '
' + heading + '
' else: - if met_article_start_char == False: - if title_started == False: - new_raw_html = new_raw_html + '
' + item + '\n' - title_started = True - else: - new_raw_html = new_raw_html + item + '\n' - else: - new_raw_html = new_raw_html + item + '

\n' + new_raw_html = new_raw_html + '

' + if item.startswith('var content ='): + content = item.replace("var content = ", '') + content = content.replace('\'', '') + content = content.replace(';', '') + new_raw_html = new_raw_html + '
' + content + '
' + if item.startswith('var photocontent ='): + photo = item.replace('var photocontent = \'', '') + photo = photo.replace('\'', '') + photo = photo.replace(';', '') + photo = photo.replace('', '') + photo = photo.replace('', '') + photo = photo.replace('', '') + photo = photo.replace('', '
') + photo = photo.replace('class="photo"', '') + new_raw_html = new_raw_html + '
' + photo + '
' + return new_raw_html + '' + else: + # .txt based file + splitter = re.compile(r'\n') # Match non-digits + new_raw_html = 'Untitled
' + next_is_img_txt = False + title_started = False + met_article_start_char = False + for item in splitter.split(raw_html): + if item.startswith(u'\u3010'): + met_article_start_char = True + new_raw_html = new_raw_html + '

' + item + '

\n' else: - next_is_img_txt = False - new_raw_html = new_raw_html + item + '\n' - return new_raw_html + '

' + if next_is_img_txt == False: + if item.startswith('='): + next_is_img_txt = True + new_raw_html += '

\n' + else: + if met_article_start_char == False: + if title_started == False: + new_raw_html = new_raw_html + '

' + item + '\n' + title_started = True + else: + new_raw_html = new_raw_html + item + '\n' + else: + new_raw_html = new_raw_html + item + '

\n' + else: + next_is_img_txt = False + new_raw_html = new_raw_html + item + '\n' + return new_raw_html + '

' def preprocess_html(self, soup): for item in soup.findAll(style=True): @@ -604,7 +678,7 @@ class MPRecipe(BasicNewsRecipe): if po is None: self.play_order_counter += 1 po = self.play_order_counter - parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), + parent.add_item('%sindex.html'%adir, None, a.title if a.title else ('Untitled Article'), play_order=po, author=auth, description=desc) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) for sp in a.sub_pages: From 372cc73c13abb6c0d2d9e7ce1613e638ad948a4f Mon Sep 17 00:00:00 2001 From: Kolenka Date: Sat, 8 Oct 2011 08:15:37 -0700 Subject: [PATCH 19/58] Sony T1: Fix variables so configuration is a bit more appropriate. --- src/calibre/devices/prst1/driver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 5c756cdabc..13f503fc00 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -42,8 +42,8 @@ class PRST1(USBMS): ) THUMBNAIL_HEIGHT = 144 - SCAN_FROM_ROOT = True - SUPPORT_SUB_DIRS = True + SUPPORTS_SUB_DIRS = True + MUST_READ_METADATA = True EBOOK_DIR_MAIN = 'Sony_Reader/media/books' EXTRA_CUSTOMIZATION_MESSAGE = [ From a2f2c3355c35f7b5b3d96172d60efee76fb864b2 Mon Sep 17 00:00:00 2001 From: Kolenka Date: Sat, 8 Oct 2011 18:17:33 -0700 Subject: [PATCH 20/58] Sony T1: Fix caching behavior so detection of books in the library is more consistent. Also fixes a minor bug with direct editing of collections on the device. --- src/calibre/devices/prst1/driver.py | 52 ++++++----------------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 13f503fc00..8ec6a3d620 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -84,10 +84,9 @@ class PRST1(USBMS): prefix = self._card_a_prefix if oncard == 'carda' else self._main_prefix - # get the metadata cache + # Let parent driver get the books self.booklist_class.rebuild_collections = self.rebuild_collections - bl = self.booklist_class(oncard, prefix, self.settings) - need_sync = self.parse_metadata_cache(bl, prefix, self.METADATA_CACHE) + bl = USBMS.books(self, oncard=oncard, end_session=end_session) debug_print("SQLite DB Path: " + self.normalize_path(prefix + 'Sony_Reader/database/books.db')) @@ -108,47 +107,17 @@ class PRST1(USBMS): for i, row in enumerate(cursor): bl_collections.setdefault(row[0], []) bl_collections[row[0]].append(row[1]) - - # Query books themselves - query = 'select _id, file_path, title, author, mime_type, modified_date, thumbnail, file_size ' \ - 'from books' - cursor.execute (query) - - # make a dict cache of paths so the lookup in the loop below is faster. - bl_cache = {} - for idx,b in enumerate(bl): - bl_cache[b.lpath] = idx - - changed = False - for i, row in enumerate(cursor): - #Book(prefix, bookId, lpath, title, author, mime, date, thumbnail_name, size=None, other=None) - thumbnail = row[6] - if thumbnail is not None: - thumbnail = self.normalize_path(prefix + row[6]) + + for idx,book in enumerate(bl): + query = 'select _id from books where file_path = ?' 
+            t = (book.lpath,)
+            cursor.execute(query, t)

-            book = Book(row[0], prefix, row[1], row[2], row[3], row[4], row[5], thumbnail, row[7])
-            book.device_collections = bl_collections.get(row[0], None)
-            debug_print('Collections for ' + row[2] + ': ' + str(book.device_collections))
-            bl_cache[row[1]] = None
-            if bl.add_book(book, replace_metadata=True):
-                changed = True
-
-        # Remove books that are no longer in the filesystem. Cache contains
-        # indices into the booklist if book not in filesystem, None otherwise
-        # Do the operation in reverse order so indices remain valid
-        for idx in sorted(bl_cache.itervalues(), reverse=True):
-            if idx is not None:
-                changed = True
-                del bl[idx]
+            for i, row in enumerate(cursor):
+                book.device_collections = bl_collections.get(row[0], None)

         cursor.close()

-        if changed:
-            if oncard == 'carda':
-                self.sync_booklists((None, bl, None))
-            else:
-                self.sync_booklists((bl, None, None))
-
         return bl

     def sync_booklists(self, booklists, end_session=True):

From ac85376b2f9d16ec7984c0e57a27fabbe2aa8033 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 9 Oct 2011 09:17:35 +0200
Subject: [PATCH 21/58] Fix coloring when using date fields in conditions.
---
 src/calibre/library/coloring.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/library/coloring.py b/src/calibre/library/coloring.py
index 584cb01e54..e1955077b3 100644
--- a/src/calibre/library/coloring.py
+++ b/src/calibre/library/coloring.py
@@ -133,7 +133,7 @@ class Rule(object): # {{{
             'lt': ('1', '', ''),
             'gt': ('', '', '1')
         }[action]
-        return "cmp(format_date(raw_field('%s'), 'yyyy-MM-dd'), %s, '%s', '%s', '%s')" % (col,
+        return "strcmp(format_date(raw_field('%s'), 'yyyy-MM-dd'), '%s', '%s', '%s', '%s')" % (col,
                 val, lt, eq, gt)

     def multiple_condition(self, col, action, val, sep):

From 5507e72f45aa0dc642d0c966bfa7777d6f782b75 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Sun, 9 Oct 2011 15:29:36 +0200
Subject: [PATCH 22/58] Add Amazon.fr
---
 src/calibre/customize/builtins.py             |  11 ++
 .../gui2/store/stores/amazon_fr_plugin.py     | 114 ++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 src/calibre/gui2/store/stores/amazon_fr_plugin.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index a2c0596e0b..79e5259c00 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -1143,6 +1143,16 @@ class StoreAmazonDEKindleStore(StoreBase):
     formats = ['KINDLE']
     affiliate = True

+class StoreAmazonFRKindleStore(StoreBase):
+    name = 'Amazon FR Kindle'
+    author = 'Charles Haley'
+    description = u'Tous les ebooks Kindle'
+    actual_plugin = 'calibre.gui2.store.stores.amazon_fr_plugin:AmazonFRKindleStore'
+
+    headquarters = 'FR'
+    formats = ['KINDLE']
+    affiliate = True
+
 class StoreAmazonUKKindleStore(StoreBase):
     name = 'Amazon UK Kindle'
     author = 'Charles Haley'
@@ -1520,6 +1530,7 @@ plugins += [
     StoreArchiveOrgStore,
     StoreAmazonKindleStore,
     StoreAmazonDEKindleStore,
+    StoreAmazonFRKindleStore,
     StoreAmazonUKKindleStore,
     StoreBaenWebScriptionStore,
     StoreBNStore,
diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py 
b/src/calibre/gui2/store/stores/amazon_fr_plugin.py new file mode 100644 index 0000000000..a5b97751ca --- /dev/null +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +import urllib +from contextlib import closing + +from lxml import html + +from PyQt4.Qt import QUrl + +from calibre import browser +from calibre.gui2 import open_url +from calibre.gui2.store import StorePlugin +from calibre.gui2.store.search_result import SearchResult + +class AmazonFRKindleStore(StorePlugin): + ''' + For comments on the implementation, please see amazon_plugin.py + ''' + + def open(self, parent=None, detail_item=None, external=False): + aff_id = {'tag': 'charhale-21'} + store_link = 'http://www.amazon.fr/livres-kindle/b?ie=UTF8&node=695398031&ref_=sa_menu_kbo1&_encoding=UTF8&tag=%(tag)s&linkCode=ur2&camp=1642&creative=19458' % aff_id + + if detail_item: + aff_id['asin'] = detail_item + store_link = 'http://www.amazon.fr/gp/redirect.html?ie=UTF8&location=http://www.amazon.fr/dp/%(asin)s&tag=%(tag)s&linkCode=ur2&camp=1634&creative=6738' % aff_id + open_url(QUrl(store_link)) + + def search(self, query, max_results=10, timeout=60): + search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords=' + url = search_url + urllib.quote_plus(query) + br = browser() + + counter = max_results + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read()) + + data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]' + format_xpath = './/span[@class="format"]/text()' + cover_xpath = './/img[@class="productImage"]/@src' + + for data in doc.xpath(data_xpath): + if counter <= 0: + break + + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (author pages). So we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format.lower(): + continue + + # We must have an asin otherwise we can't easily reference the + # book later. + asin = ''.join(data.xpath("@name")) + + cover_url = ''.join(data.xpath(cover_xpath)) + + title = ''.join(data.xpath('.//div[@class="title"]/a/text()')) + price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()')) + author = unicode(''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))) + author = author.split('de ')[-1] + +# print (author, asin, cover_url, title, price) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.formats = 'Kindle' + s.DRM = SearchResult.DRM_UNKNOWN + yield s + + def get_details(self, search_result, timeout): + # We might already have been called. 
+ if search_result.drm: + return + + url = 'http://amazon.fr/dp/' + drm_search_text = u'Simultaneous Device Usage' + drm_free_text = u'Unlimited' + + br = browser() + with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: + idata = html.fromstring(nf.read()) + if not search_result.author: + search_result.author = ''.join(idata.xpath('//div[@class="buying" and contains(., "Author")]/a/text()')) + is_kindle = idata.xpath('boolean(//div[@class="buying"]/h1/span/span[contains(text(), "Kindle Edition")])') + if is_kindle: + search_result.formats = 'Kindle' + if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + drm_search_text + '")])'): + if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + + drm_free_text + '") and contains(b, "' + + drm_search_text + '")])'): + search_result.drm = SearchResult.DRM_UNLOCKED + else: + search_result.drm = SearchResult.DRM_UNKNOWN + else: + search_result.drm = SearchResult.DRM_LOCKED + return True + + From 7003ae2aa991052d226ba7b4da9e55d764e9438a Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 9 Oct 2011 10:54:19 -0400 Subject: [PATCH 23/58] Fix encoding issues when searching and displaying results for Amazon plugins. --- .../gui2/store/stores/amazon_de_plugin.py | 5 +-- .../gui2/store/stores/amazon_fr_plugin.py | 43 +++---------------- .../gui2/store/stores/amazon_plugin.py | 5 +-- .../gui2/store/stores/amazon_uk_plugin.py | 5 +-- 4 files changed, 11 insertions(+), 47 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 9f26e765e6..4948a48714 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -6,7 +6,6 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import urllib from contextlib import closing from lxml import html @@ -37,12 +36,12 @@ class AmazonDEKindleStore(StorePlugin): def search(self, query, max_results=10, timeout=60): search_url = 'http://www.amazon.de/s/?url=search-alias%3Ddigital-text&field-keywords=' - url = search_url + urllib.quote_plus(query) + url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') br = browser() counter = max_results with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) + doc = html.fromstring(f.read().decode('latin-1', 'replace')) # Amazon has two results pages. # 20110725: seems that is_shot is gone. 
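# The replacement for urllib.quote_plus used throughout this patch relies on
# backslashreplace producing '\xHH' escape sequences, which the chained
# replaces turn into latin-1 style '%HH' percent-encoding: any literal '%' is
# escaped first, then '\x' becomes '%', and spaces become '+'. A minimal
# sketch of the transform (the sample query is illustrative; note the trick
# only covers characters in the Latin-1 range):
query = u'caf\xe9 cr\xe8me'
q = query.encode('ascii', 'backslashreplace')   # 'caf\\xe9 cr\\xe8me'
q = q.replace('%', '%25').replace('\\x', '%').replace(' ', '+')
# q is now 'caf%e9+cr%e8me'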
diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index a5b97751ca..186ca8d4b4 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -6,7 +6,6 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import urllib from contextlib import closing from lxml import html @@ -34,12 +33,12 @@ class AmazonFRKindleStore(StorePlugin): def search(self, query, max_results=10, timeout=60): search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords=' - url = search_url + urllib.quote_plus(query) + url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') br = browser() counter = max_results with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) + doc = html.fromstring(f.read().decode('latin-1', 'replace')) data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]' format_xpath = './/span[@class="format"]/text()' @@ -66,9 +65,7 @@ class AmazonFRKindleStore(StorePlugin): title = ''.join(data.xpath('.//div[@class="title"]/a/text()')) price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()')) author = unicode(''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))) - author = author.split('de ')[-1] - -# print (author, asin, cover_url, title, price) + author = author.split('et ')[-1] counter -= 1 @@ -79,36 +76,6 @@ class AmazonFRKindleStore(StorePlugin): s.price = price.strip() s.detail_item = asin.strip() s.formats = 'Kindle' - s.DRM = SearchResult.DRM_UNKNOWN + s.drm = SearchResult.DRM_UNKNOWN + yield s - - def get_details(self, search_result, timeout): - # We might already have been called. 
- if search_result.drm: - return - - url = 'http://amazon.fr/dp/' - drm_search_text = u'Simultaneous Device Usage' - drm_free_text = u'Unlimited' - - br = browser() - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if not search_result.author: - search_result.author = ''.join(idata.xpath('//div[@class="buying" and contains(., "Author")]/a/text()')) - is_kindle = idata.xpath('boolean(//div[@class="buying"]/h1/span/span[contains(text(), "Kindle Edition")])') - if is_kindle: - search_result.formats = 'Kindle' - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + - drm_search_text + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - drm_free_text + '") and contains(b, "' + - drm_search_text + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN - else: - search_result.drm = SearchResult.DRM_LOCKED - return True - - diff --git a/src/calibre/gui2/store/stores/amazon_plugin.py b/src/calibre/gui2/store/stores/amazon_plugin.py index 693ef883fb..89a6278535 100644 --- a/src/calibre/gui2/store/stores/amazon_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_plugin.py @@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en' import random import re -import urllib from contextlib import closing from lxml import html @@ -122,12 +121,12 @@ class AmazonKindleStore(StorePlugin): open_url(QUrl(store_link)) def search(self, query, max_results=10, timeout=60): - url = self.search_url + urllib.quote_plus(query) + url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') br = browser() counter = max_results with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) + doc = html.fromstring(f.read().decode('latin-1', 'replace')) # Amazon has two results pages. is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])') diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 86603f3fc3..3b2a4d05cc 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -6,7 +6,6 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import urllib from contextlib import closing from lxml import html @@ -34,12 +33,12 @@ class AmazonUKKindleStore(StorePlugin): def search(self, query, max_results=10, timeout=60): search_url = 'http://www.amazon.co.uk/s/?url=search-alias%3Ddigital-text&field-keywords=' - url = search_url + urllib.quote_plus(query) + url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') br = browser() counter = max_results with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) + doc = html.fromstring(f.read().decode('latin-1', 'replace')) # Amazon has two results pages. # 20110725: seems that is_shot is gone. From 9e114a9e1f763c8160c652eb45c1f80e85a527b5 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sun, 9 Oct 2011 17:39:25 +0200 Subject: [PATCH 24/58] Clean up/fix code to strip leading "by " in German and French. 
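The old code used str.split() to drop the localised "by" marker, which
truncates any author whose name happens to contain the marker as a
substring. A minimal sketch of the difference, using a made-up author
string (not taken from the store data):

    author = 'Devon Monk'
    # old approach: 'von ' also matches inside "Devon", so only the
    # tail survives
    author.split('von ')[-1]        # -> 'Monk'
    # new approach: strip the marker only when it really is a prefix
    if author.startswith('von '):
        author = author[4:]         # 'Devon Monk' is left untouched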
--- .../gui2/store/stores/amazon_de_plugin.py | 17 ++--------------- .../gui2/store/stores/amazon_fr_plugin.py | 5 +++-- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 4948a48714..ea92839268 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -43,20 +43,9 @@ class AmazonDEKindleStore(StorePlugin): with closing(br.open(url, timeout=timeout)) as f: doc = html.fromstring(f.read().decode('latin-1', 'replace')) - # Amazon has two results pages. - # 20110725: seems that is_shot is gone. -# is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])') -# # Horizontal grid of books. -# if is_shot: -# data_xpath = '//div[contains(@class, "result")]' -# format_xpath = './/div[@class="productTitle"]/text()' -# cover_xpath = './/div[@class="productTitle"]//img/@src' -# # Vertical list of books. -# else: data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]' format_xpath = './/span[@class="format"]/text()' cover_xpath = './/img[@class="productImage"]/@src' -# end is_shot else for data in doc.xpath(data_xpath): if counter <= 0: @@ -79,11 +68,9 @@ class AmazonDEKindleStore(StorePlugin): title = ''.join(data.xpath('.//div[@class="title"]/a/text()')) price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()')) -# if is_shot: -# author = format.split(' von ')[-1] -# else: author = ''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()')) - author = author.split('von ')[-1] + if author.startswith('von '): + author = author[4:] counter -= 1 diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index 186ca8d4b4..ca36f1055b 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -65,7 +65,8 @@ class AmazonFRKindleStore(StorePlugin): title = ''.join(data.xpath('.//div[@class="title"]/a/text()')) price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()')) author = unicode(''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))) - author = author.split('et ')[-1] + if author.startswith('de '): + author = author[3:] counter -= 1 @@ -77,5 +78,5 @@ class AmazonFRKindleStore(StorePlugin): s.detail_item = asin.strip() s.formats = 'Kindle' s.drm = SearchResult.DRM_UNKNOWN - + yield s From 18ed5671c699b90dbee72dd7dd2a4e426ee84148 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sun, 9 Oct 2011 17:48:40 +0200 Subject: [PATCH 25/58] Fix 'by' splitting in the UK plugin. --- .../gui2/store/stores/amazon_uk_plugin.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 3b2a4d05cc..ef15951d50 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -40,20 +40,9 @@ class AmazonUKKindleStore(StorePlugin): with closing(br.open(url, timeout=timeout)) as f: doc = html.fromstring(f.read().decode('latin-1', 'replace')) - # Amazon has two results pages. - # 20110725: seems that is_shot is gone. -# is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])') -# # Horizontal grid of books. 
-# if is_shot: -# data_xpath = '//div[contains(@class, "result")]' -# format_xpath = './/div[@class="productTitle"]/text()' -# cover_xpath = './/div[@class="productTitle"]//img/@src' -# # Vertical list of books. -# else: data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]' format_xpath = './/span[@class="format"]/text()' cover_xpath = './/img[@class="productImage"]/@src' -# end is_shot else for data in doc.xpath(data_xpath): if counter <= 0: @@ -76,11 +65,9 @@ class AmazonUKKindleStore(StorePlugin): title = ''.join(data.xpath('.//div[@class="title"]/a/text()')) price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()')) -# if is_shot: -# author = format.split(' von ')[-1] -# else: author = ''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()')) - author = author.split('by ')[-1] + if author.startswith('by '): + author = author[3:] counter -= 1 From dcd4e7a6b0759679d0ec1358e650ecb4f274ac52 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sun, 9 Oct 2011 18:02:33 +0200 Subject: [PATCH 26/58] Formatting change to list_equals documentation --- src/calibre/manual/template_lang.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/manual/template_lang.rst b/src/calibre/manual/template_lang.rst index 8953bda2f7..8b5329372b 100644 --- a/src/calibre/manual/template_lang.rst +++ b/src/calibre/manual/template_lang.rst @@ -266,7 +266,7 @@ The following functions are available in addition to those described in single-f * ``has_cover()`` -- return ``Yes`` if the book has a cover, otherwise return the empty string * ``not(value)`` -- returns the string "1" if the value is empty, otherwise returns the empty string. This function works well with test or first_non_empty. You can have as many values as you want. * ``list_difference(list1, list2, separator)`` -- return a list made by removing from `list1` any item found in `list2`, using a case-insensitive compare. The items in `list1` and `list2` are separated by separator, as are the items in the returned list. - * ``list_equals(list1, sep1, list2, sep2, yes_val, no_val) -- return `yes_val` if list1 and list2 contain the same items, otherwise return `no_val`. The items are determined by splitting each list using the appropriate separator character (`sep1` or `sep2`). The order of items in the lists is not relevant. The compare is case insensitive. + * ``list_equals(list1, sep1, list2, sep2, yes_val, no_val)`` -- return `yes_val` if `list1` and `list2` contain the same items, otherwise return `no_val`. The items are determined by splitting each list using the appropriate separator character (`sep1` or `sep2`). The order of items in the lists is not relevant. The compare is case insensitive. * ``list_intersection(list1, list2, separator)`` -- return a list made by removing from `list1` any item not found in `list2`, using a case-insensitive compare. The items in `list1` and `list2` are separated by separator, as are the items in the returned list. * ``list_sort(list, direction, separator)`` -- return list sorted using a case-insensitive sort. If `direction` is zero, the list is sorted ascending, otherwise descending. The list items are separated by separator, as are the items in the returned list. * ``list_union(list1, list2, separator)`` -- return a list made by merging the items in list1 and list2, removing duplicate items using a case-insensitive compare. If items differ in case, the one in list1 is used. 
The items in list1 and list2 are separated by separator, as are the items in the returned list. From ff7f90c2eced345d9e9d2fa57ec056164a3b38dd Mon Sep 17 00:00:00 2001 From: Kolenka Date: Sun, 9 Oct 2011 09:28:02 -0700 Subject: [PATCH 27/58] Sony T1: Support for copying covers similar to the 505 driver. Performs the copy during sync_booklists due to having access to the book's id/lpath at that point, and being able to detect new books. When upload_cover is normally called, this information is usually not accessible, and the book isn't actually in the database. It's less fragile this way. Fixes an issue with setting floating point values in 'added_time' column. Also removes Book from books.py (not needed at this point) --- src/calibre/devices/prst1/books.py | 35 -------------- src/calibre/devices/prst1/driver.py | 73 +++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/src/calibre/devices/prst1/books.py b/src/calibre/devices/prst1/books.py index a840d13b5a..40e70f2af0 100644 --- a/src/calibre/devices/prst1/books.py +++ b/src/calibre/devices/prst1/books.py @@ -3,41 +3,6 @@ __copyright__ = '2010, Timothy Legge ' ''' ''' -import os -import time - -from calibre.devices.usbms.books import Book as Book_ - -class Book(Book_): - - def __init__(self, bookId, prefix, lpath, title, author, mime, date, thumbnail_name, size=None, other=None): - Book_.__init__(self, prefix, lpath) - - self.bookId = bookId - self.title = title - if not author: - self.authors = [''] - else: - self.authors = [author] - - if not title: - self.title = _('Unknown') - - self.mime = mime - - self.size = size # will be set later if None - - try: - self.datetime = time.gmtime(os.path.getctime(self.path)) - except: - self.datetime = time.gmtime() - - if thumbnail_name is not None: - self.thumbnail = ImageWrapper(thumbnail_name) - self.tags = [] - if other: - self.smart_update(other) - class ImageWrapper(object): def __init__(self, image_path): self.image_path = image_path diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 8ec6a3d620..0034224ac9 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -18,7 +18,7 @@ from calibre.devices.usbms.driver import USBMS, debug_print from calibre import __appname__, prints from calibre.devices.usbms.books import CollectionsBookList from calibre.devices.usbms.books import BookList -from calibre.devices.prst1.books import Book +from calibre.devices.prst1.books import ImageWrapper class PRST1(USBMS): name = 'SONY PRST1 and newer Device Interface' @@ -50,12 +50,41 @@ class PRST1(USBMS): _('Comma separated list of metadata fields ' 'to turn into collections on the device. Possibilities include: ')+\ 'series, tags, authors', + _('Upload separate cover thumbnails for books') + + ':::'+_('Normally, the SONY readers get the cover image from the' + ' ebook file itself. With this option, calibre will send a ' + 'separate cover image to the reader, useful if you are ' + 'sending DRMed books in which you cannot change the cover.'), + _('Refresh separate covers when using automatic management') + + ':::' + + _('Set this option to have separate book covers uploaded ' + 'every time you connect your device. 
Unset this option if ' + 'you have so many books on the reader that performance is ' + 'unacceptable.'), + _('Preserve cover aspect ratio when building thumbnails') + + ':::' + + _('Set this option if you want the cover thumbnails to have ' + 'the same aspect ratio (width to height) as the cover. ' + 'Unset it if you want the thumbnail to be the maximum size, ' + 'ignoring aspect ratio.'), ] EXTRA_CUSTOMIZATION_DEFAULT = [ ', '.join(['series', 'tags']), + False, + False, + True, ] OPT_COLLECTIONS = 0 + OPT_UPLOAD_COVERS = 1 + OPT_REFRESH_COVERS = 2 + OPT_PRESERVE_ASPECT_RATIO = 3 + + def post_open_callback(self): + # Set the thumbnail width to the theoretical max if the user has asked + # that we do not preserve aspect ratio + if not self.settings().extra_customization[self.OPT_PRESERVE_ASPECT_RATIO]: + self.THUMBNAIL_WIDTH = 108 def windows_filter_pnp_id(self, pnp_id): return '_LAUNCHER' in pnp_id or '_SETTING' in pnp_id @@ -109,12 +138,17 @@ class PRST1(USBMS): bl_collections[row[0]].append(row[1]) for idx,book in enumerate(bl): - query = 'select _id from books where file_path = ?' + query = 'select _id, thumbnail from books where file_path = ?' t = (book.lpath,) cursor.execute (query, t) for i, row in enumerate(cursor): - book.device_collections = bl_collections.get(row[0], None) + book.device_collections = bl_collections.get(row[0], None) + thumbnail = row[1] + if thumbnail is not None: + thumbnail = self.normalize_path(prefix + thumbnail) + book.thumbnail = ImageWrapper(thumbnail) + debug_print('Got thumnail for :' + book.title) cursor.close() @@ -155,6 +189,10 @@ class PRST1(USBMS): debug_print('PRST1: finished update_device_database') def update_device_books(self, connection, booklist, source_id): + opts = self.settings() + upload_covers = opts.extra_customization[self.OPT_UPLOAD_COVERS]; + refresh_covers = opts.extra_customization[self.OPT_REFRESH_COVERS] + cursor = connection.cursor() # Get existing books @@ -173,9 +211,11 @@ class PRST1(USBMS): query = 'insert into books ' \ '(title, author, source_id, added_date, modified_date, file_path, file_name, file_size, mime_type, corrupted, prevent_delete) ' \ 'values (?,?,?,?,?,?,?,?,?,0,0)' - t = (book.title, book.authors[0], source_id, time.time() * 1000, calendar.timegm(book.datetime), lpath, os.path.basename(book.lpath), book.size, book.mime ) + t = (book.title, book.authors[0], source_id, int(time.time() * 1000), calendar.timegm(book.datetime), lpath, os.path.basename(book.lpath), book.size, book.mime ) cursor.execute(query, t) book.bookId = cursor.lastrowid + if upload_covers: + self.upload_book_cover(connection, book, source_id) debug_print('Inserted New Book: ' + book.title) else: query = 'update books ' \ @@ -184,6 +224,8 @@ class PRST1(USBMS): t = (book.title, book.authors[0], calendar.timegm(book.datetime), book.size, lpath) cursor.execute(query, t) book.bookId = dbBooks[lpath] + if refresh_covers: + self.upload_book_cover(connection, book, source_id) dbBooks[lpath] = None for book, bookId in dbBooks.items(): @@ -289,4 +331,27 @@ class PRST1(USBMS): self.update_device_database(booklist, collections, oncard) debug_print('PRS-T1: finished rebuild_collections') + + def upload_book_cover(self, connection, book, source_id): + debug_print('PRST1: Uploading/Refreshing Cover for ' + book.title) + cursor = connection.cursor() + if book.thumbnail and book.thumbnail[-1]: + thumbnailPath = 'Sony_Reader/database/cache/books/' + str(book.bookId) +'/thumbnail/main_thumbnail.jpg' + + prefix = self._main_prefix if source_id is 0 else 
self._card_a_prefix + thumbnailFilePath = os.path.join(prefix, *thumbnailPath.split('/')) + thumbnailDirPath = os.path.dirname(thumbnailFilePath) + if not os.path.exists(thumbnailDirPath): + os.makedirs(thumbnailDirPath) + + with open(thumbnailFilePath, 'wb') as f: + f.write(book.thumbnail[-1]) + + query = 'update books ' \ + 'set thumbnail = ?' \ + 'where _id = ? ' + t = (thumbnailPath,book.bookId,) + cursor.execute(query, t) + + cursor.close() \ No newline at end of file From 2b63af44ad1d5f72127dc301ba975b210c7a4b90 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 10 Oct 2011 04:21:05 +0530 Subject: [PATCH 28/58] Economist, get larger cover --- recipes/economist.recipe | 10 ++++++++-- recipes/economist_free.recipe | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/recipes/economist.recipe b/recipes/economist.recipe index 7dc869bf74..0a75706f5b 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe): ' perspective. Best downloaded on Friday mornings (GMT)') extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }' oldest_article = 7.0 - cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg' - #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' remove_tags = [ dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), dict(attrs={'class':['dblClkTrk', 'ec-article-info', @@ -56,6 +54,14 @@ class Economist(BasicNewsRecipe): return br ''' + def get_cover_url(self): + br = self.browser + br.open(self.INDEX) + issue = br.geturl().split('/')[4] + self.log('Fetching cover for issue: %s'%issue) + cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-')) + return cover_url + def parse_index(self): return self.economist_parse_index() diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe index 5f45a6ab8f..8d446d7de3 100644 --- a/recipes/economist_free.recipe +++ b/recipes/economist_free.recipe @@ -22,8 +22,6 @@ class Economist(BasicNewsRecipe): ' perspective. Best downloaded on Friday mornings (GMT)') extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }' oldest_article = 7.0 - cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg' - #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' remove_tags = [ dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), dict(attrs={'class':['dblClkTrk', 'ec-article-info', @@ -40,6 +38,14 @@ class Economist(BasicNewsRecipe): # downloaded with connection reset by peer (104) errors. delay = 1 + def get_cover_url(self): + br = self.browser + br.open(self.INDEX) + issue = br.geturl().split('/')[4] + self.log('Fetching cover for issue: %s'%issue) + cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-')) + return cover_url + def parse_index(self): try: From d808ffd23148dbc873bce2821883e51c0896d43b Mon Sep 17 00:00:00 2001 From: Kolenka Date: Sun, 9 Oct 2011 17:00:24 -0700 Subject: [PATCH 29/58] Sony T1: Support "added_order" for collections, and enable plugboards. 
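Collection rows now carry the added_order column, so collections on the
reader keep the same ordering as the calibre booklist instead of the
arbitrary insertion order. Metadata written to the Sony database is
also routed through the 'device_db' plugboard first. A minimal sketch
of the plugboard pattern used in the driver below (book is assumed to
be a booklist entry):

    plugboard = self.plugboard_func(self.__class__.__name__,
                                    'device_db', self.plugboards)
    if plugboard is not None:
        # work on a copy so the in-memory booklist entry is not mutated
        newmi = book.deepcopy_metadata()
        newmi.template_to_attribute(book, plugboard)
    else:
        newmi = book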
--- src/calibre/devices/prst1/driver.py | 49 +++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 0034224ac9..0987e21a1c 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -31,6 +31,7 @@ class PRST1(USBMS): FORMATS = ['epub', 'pdf', 'txt'] CAN_SET_METADATA = ['title', 'authors', 'collections'] + CAN_DO_DEVICE_DB_PLUGBOARD = True VENDOR_ID = [0x054c] #: SONY Vendor Id PRODUCT_ID = [0x05c2] @@ -80,6 +81,9 @@ class PRST1(USBMS): OPT_REFRESH_COVERS = 2 OPT_PRESERVE_ASPECT_RATIO = 3 + plugboards = None + plugboard_func = None + def post_open_callback(self): # Set the thumbnail width to the theoretical max if the user has asked # that we do not preserve aspect ratio @@ -148,12 +152,15 @@ class PRST1(USBMS): if thumbnail is not None: thumbnail = self.normalize_path(prefix + thumbnail) book.thumbnail = ImageWrapper(thumbnail) - debug_print('Got thumnail for :' + book.title) cursor.close() return bl + def set_plugboards(self, plugboards, pb_func): + self.plugboards = plugboards + self.plugboard_func = pb_func + def sync_booklists(self, booklists, end_session=True): debug_print('PRST1: starting sync_booklists') @@ -176,6 +183,11 @@ class PRST1(USBMS): def update_device_database(self, booklist, collections_attributes, oncard): debug_print('PRST1: starting update_device_database') + plugboard = None + if self.plugboard_func: + plugboard = self.plugboard_func(self.__class__.__name__, 'device_db', self.plugboards) + debug_print("PRST1: Using Plugboard", plugboard) + prefix = self._card_a_prefix if oncard == 'carda' else self._main_prefix source_id = 1 if oncard == 'carda' else 0 debug_print("SQLite DB Path: " + self.normalize_path(prefix + 'Sony_Reader/database/books.db')) @@ -183,14 +195,14 @@ class PRST1(USBMS): collections = booklist.get_collections(collections_attributes) with closing(sqlite.connect(self.normalize_path(prefix + 'Sony_Reader/database/books.db'))) as connection: - self.update_device_books(connection, booklist, source_id) + self.update_device_books(connection, booklist, source_id, plugboard) self.update_device_collections(connection, booklist, collections, source_id) debug_print('PRST1: finished update_device_database') - def update_device_books(self, connection, booklist, source_id): + def update_device_books(self, connection, booklist, source_id, plugboard): opts = self.settings() - upload_covers = opts.extra_customization[self.OPT_UPLOAD_COVERS]; + upload_covers = opts.extra_customization[self.OPT_UPLOAD_COVERS] refresh_covers = opts.extra_customization[self.OPT_REFRESH_COVERS] cursor = connection.cursor() @@ -205,13 +217,24 @@ class PRST1(USBMS): lpath = row[0].replace('\\', '/') dbBooks[lpath] = row[1] - for book in booklist: + for book in booklist: + # Run through plugboard if needed + if plugboard is not None: + newmi = book.deepcopy_metadata() + newmi.template_to_attribute(book, plugboard) + else: + newmi = book + + # Get Metadata We Want lpath = book.lpath + author = newmi.authors[0] + title = newmi.title + if lpath not in dbBooks: query = 'insert into books ' \ '(title, author, source_id, added_date, modified_date, file_path, file_name, file_size, mime_type, corrupted, prevent_delete) ' \ 'values (?,?,?,?,?,?,?,?,?,0,0)' - t = (book.title, book.authors[0], source_id, int(time.time() * 1000), calendar.timegm(book.datetime), lpath, os.path.basename(book.lpath), book.size, book.mime ) + t = (title, author, source_id, 
int(time.time() * 1000), calendar.timegm(book.datetime), lpath, os.path.basename(book.lpath), book.size, book.mime ) cursor.execute(query, t) book.bookId = cursor.lastrowid if upload_covers: @@ -221,7 +244,7 @@ class PRST1(USBMS): query = 'update books ' \ 'set title = ?, author = ?, modified_date = ?, file_size = ? ' \ 'where file_path = ?' - t = (book.title, book.authors[0], calendar.timegm(book.datetime), book.size, lpath) + t = (title, author, calendar.timegm(book.datetime), book.size, lpath) cursor.execute(query, t) book.bookId = dbBooks[lpath] if refresh_covers: @@ -278,14 +301,20 @@ class PRST1(USBMS): for i, row in enumerate(cursor): dbBooks[row[0]] = row[1] - for book in books: + for idx, book in enumerate(books): if dbBooks.get(book.lpath, None) is None: if collection not in book.device_collections: book.device_collections.append(collection) - query = 'insert into collections (collection_id, content_id) values (?,?)' - t = (dbCollections[collection], book.bookId) + query = 'insert into collections (collection_id, content_id, added_order) values (?,?,?)' + t = (dbCollections[collection], book.bookId, idx) cursor.execute(query, t) debug_print('Inserted Book Into Collection: ' + book.title + ' -> ' + collection) + else: + query = 'update collections ' \ + 'set added_order = ? ' \ + 'where content_id = ? and collection_id = ? ' + t = (idx, book.bookId, dbCollections[collection]) + cursor.execute(query, t) dbBooks[book.lpath] = None From 948d176e242c2700d620056825f847ea2de1d341 Mon Sep 17 00:00:00 2001 From: Kolenka Date: Sun, 9 Oct 2011 17:17:03 -0700 Subject: [PATCH 30/58] Sony T1: Disable editing title/author on the device directly. It does no good --- src/calibre/devices/prst1/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 0987e21a1c..0fde160e50 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -30,7 +30,7 @@ class PRST1(USBMS): booklist_class = CollectionsBookList FORMATS = ['epub', 'pdf', 'txt'] - CAN_SET_METADATA = ['title', 'authors', 'collections'] + CAN_SET_METADATA = ['collections'] CAN_DO_DEVICE_DB_PLUGBOARD = True VENDOR_ID = [0x054c] #: SONY Vendor Id From 304a0ae408dbaa306eaac956202b4442c0bda50a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 10 Oct 2011 09:23:50 +0530 Subject: [PATCH 31/58] Ensure that mount points on linux are never blank. 
Also make find_device_nodes reusable --- src/calibre/devices/interface.py | 2 +- src/calibre/devices/usbms/device.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index 4877cd359e..56c950bd16 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -217,7 +217,7 @@ class DevicePlugin(Plugin): ''' Unix version of :meth:`can_handle_windows` - :param device_info: Is a tupe of (vid, pid, bcd, manufacturer, product, + :param device_info: Is a tuple of (vid, pid, bcd, manufacturer, product, serial number) ''' diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index c4f2ec26ed..85ab5905b9 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -483,7 +483,7 @@ class Device(DeviceConfig, DevicePlugin): self._card_a_prefix = get_card_prefix('carda') self._card_b_prefix = get_card_prefix('cardb') - def find_device_nodes(self): + def find_device_nodes(self, detected_device=None): def walk(base): base = os.path.abspath(os.path.realpath(base)) @@ -507,8 +507,11 @@ class Device(DeviceConfig, DevicePlugin): d, j = os.path.dirname, os.path.join usb_dir = None + if detected_device is None: + detected_device = self.detected_device + def test(val, attr): - q = getattr(self.detected_device, attr) + q = getattr(detected_device, attr) return q == val for x, isfile in walk('/sys/devices'): @@ -596,6 +599,8 @@ class Device(DeviceConfig, DevicePlugin): label = self.STORAGE_CARD2_VOLUME_LABEL if not label: label = self.STORAGE_CARD_VOLUME_LABEL + ' 2' + if not label: + label = 'E-book Reader (%s)'%type extra = 0 while True: q = ' (%d)'%extra if extra else '' From 73621c90014389f7ef2f860d2dc2acca2ea53166 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 10 Oct 2011 10:23:43 +0530 Subject: [PATCH 32/58] Fix #870457 (Patch: Sony T1 Initial Support) --- src/calibre/customize/builtins.py | 3 +- src/calibre/devices/prs505/driver.py | 37 --- src/calibre/devices/prst1/__init__.py | 7 + src/calibre/devices/prst1/driver.py | 427 ++++++++++++++++++++++++++ 4 files changed, 436 insertions(+), 38 deletions(-) create mode 100644 src/calibre/devices/prst1/__init__.py create mode 100644 src/calibre/devices/prst1/driver.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index a2c0596e0b..4dccb05092 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -555,7 +555,8 @@ from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800 from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX from calibre.devices.nook.driver import NOOK, NOOK_COLOR -from calibre.devices.prs505.driver import PRS505, PRST1 +from calibre.devices.prs505.driver import PRS505 +from calibre.devices.prst1.driver import PRST1 from calibre.devices.user_defined.driver import USER_DEFINED from calibre.devices.android.driver import ANDROID, S60, WEBOS from calibre.devices.nokia.driver import N770, N810, E71X, E52 diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index df436bfd9f..4d9c66aaa8 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -299,40 +299,3 @@ class PRS505(USBMS): f.write(metadata.thumbnail[-1]) debug_print('Cover uploaded to: %r'%cpath) -class PRST1(USBMS): - name = 'SONY PRST1 and newer Device Interface' - 
gui_name = 'SONY Reader' - description = _('Communicate with Sony PRST1 and newer eBook readers') - author = 'Kovid Goyal' - supported_platforms = ['windows', 'osx', 'linux'] - - FORMATS = ['epub', 'lrf', 'lrx', 'rtf', 'pdf', 'txt'] - VENDOR_ID = [0x054c] #: SONY Vendor Id - PRODUCT_ID = [0x05c2] - BCD = [0x226] - - VENDOR_NAME = 'SONY' - WINDOWS_MAIN_MEM = re.compile( - r'(PRS-T1&)' - ) - - THUMBNAIL_HEIGHT = 217 - SCAN_FROM_ROOT = True - EBOOK_DIR_MAIN = __appname__ - SUPPORTS_SUB_DIRS = True - - def windows_filter_pnp_id(self, pnp_id): - return '_LAUNCHER' in pnp_id or '_SETTING' in pnp_id - - def get_carda_ebook_dir(self, for_upload=False): - if for_upload: - return __appname__ - return self.EBOOK_DIR_CARD_A - - def get_main_ebook_dir(self, for_upload=False): - if for_upload: - return __appname__ - return '' - - - diff --git a/src/calibre/devices/prst1/__init__.py b/src/calibre/devices/prst1/__init__.py new file mode 100644 index 0000000000..4ed1c1cbbe --- /dev/null +++ b/src/calibre/devices/prst1/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py new file mode 100644 index 0000000000..327334aaec --- /dev/null +++ b/src/calibre/devices/prst1/driver.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +''' +Device driver for the SONY T1 devices +''' + +import os, time, calendar, re +import sqlite3 as sqlite +from contextlib import closing + +from calibre.devices.usbms.driver import USBMS, debug_print +from calibre.devices.usbms.device import USBDevice +from calibre.devices.usbms.books import CollectionsBookList +from calibre.devices.usbms.books import BookList +from calibre.constants import islinux + +DBPATH = 'Sony_Reader/database/books.db' +THUMBPATH = 'Sony_Reader/database/cache/books/%s/thumbnail/main_thumbnail.jpg' + +class ImageWrapper(object): + def __init__(self, image_path): + self.image_path = image_path + +class PRST1(USBMS): + name = 'SONY PRST1 and newer Device Interface' + gui_name = 'SONY Reader' + description = _('Communicate with the PRST1 and newer SONY eBook readers') + author = 'Kovid Goyal' + supported_platforms = ['windows', 'osx', 'linux'] + path_sep = '/' + booklist_class = CollectionsBookList + + FORMATS = ['epub', 'pdf', 'txt'] + CAN_SET_METADATA = ['collections'] + CAN_DO_DEVICE_DB_PLUGBOARD = True + + VENDOR_ID = [0x054c] #: SONY Vendor Id + PRODUCT_ID = [0x05c2] + BCD = [0x226] + + VENDOR_NAME = 'SONY' + WINDOWS_MAIN_MEM = re.compile( + r'(PRS-T1&)' + ) + MAIN_MEMORY_VOLUME_LABEL = 'SONY Reader Main Memory' + STORAGE_CARD_VOLUME_LABEL = 'SONY Reader Storage Card' + + THUMBNAIL_HEIGHT = 144 + SUPPORTS_SUB_DIRS = True + MUST_READ_METADATA = True + EBOOK_DIR_MAIN = 'Sony_Reader/media/books' + + EXTRA_CUSTOMIZATION_MESSAGE = [ + _('Comma separated list of metadata fields ' + 'to turn into collections on the device. Possibilities include: ')+\ + 'series, tags, authors', + _('Upload separate cover thumbnails for books') + + ':::'+_('Normally, the SONY readers get the cover image from the' + ' ebook file itself. 
With this option, calibre will send a ' + 'separate cover image to the reader, useful if you are ' + 'sending DRMed books in which you cannot change the cover.'), + _('Refresh separate covers when using automatic management') + + ':::' + + _('Set this option to have separate book covers uploaded ' + 'every time you connect your device. Unset this option if ' + 'you have so many books on the reader that performance is ' + 'unacceptable.'), + _('Preserve cover aspect ratio when building thumbnails') + + ':::' + + _('Set this option if you want the cover thumbnails to have ' + 'the same aspect ratio (width to height) as the cover. ' + 'Unset it if you want the thumbnail to be the maximum size, ' + 'ignoring aspect ratio.'), + ] + EXTRA_CUSTOMIZATION_DEFAULT = [ + ', '.join(['series', 'tags']), + True, + False, + True, + ] + + OPT_COLLECTIONS = 0 + OPT_UPLOAD_COVERS = 1 + OPT_REFRESH_COVERS = 2 + OPT_PRESERVE_ASPECT_RATIO = 3 + + plugboards = None + plugboard_func = None + + def post_open_callback(self): + # Set the thumbnail width to the theoretical max if the user has asked + # that we do not preserve aspect ratio + if not self.settings().extra_customization[self.OPT_PRESERVE_ASPECT_RATIO]: + self.THUMBNAIL_WIDTH = 108 + + def windows_filter_pnp_id(self, pnp_id): + return '_LAUNCHER' in pnp_id or '_SETTING' in pnp_id + + def get_carda_ebook_dir(self, for_upload=False): + if for_upload: + return self.EBOOK_DIR_MAIN + return self.EBOOK_DIR_CARD_A + + def get_main_ebook_dir(self, for_upload=False): + if for_upload: + return self.EBOOK_DIR_MAIN + return '' + + def can_handle(self, devinfo, debug=False): + if islinux: + dev = USBDevice(devinfo) + main, carda, cardb = self.find_device_nodes(detected_device=dev) + if main is None and carda is None and cardb is None: + if debug: + print ('\tPRS-T1: Appears to be in non data mode' + ' or was ejected, ignoring') + return False + return True + + def books(self, oncard=None, end_session=True): + dummy_bl = BookList(None, None, None) + + if ( + (oncard == 'carda' and not self._card_a_prefix) or + (oncard and oncard != 'carda') + ): + self.report_progress(1.0, _('Getting list of books on device...')) + return dummy_bl + + prefix = self._card_a_prefix if oncard == 'carda' else self._main_prefix + + # Let parent driver get the books + self.booklist_class.rebuild_collections = self.rebuild_collections + bl = USBMS.books(self, oncard=oncard, end_session=end_session) + + dbpath = self.normalize_path(prefix + DBPATH) + debug_print("SQLite DB Path: " + dbpath) + + with closing(sqlite.connect(dbpath)) as connection: + # Replace undecodable characters in the db instead of erroring out + connection.text_factory = lambda x: unicode(x, "utf-8", "replace") + + cursor = connection.cursor() + # Query collections + query = ''' + SELECT books._id, collection.title + FROM collections + LEFT OUTER JOIN books + LEFT OUTER JOIN collection + WHERE collections.content_id = books._id AND + collections.collection_id = collection._id + ''' + cursor.execute(query) + + bl_collections = {} + for i, row in enumerate(cursor): + bl_collections.setdefault(row[0], []) + bl_collections[row[0]].append(row[1]) + + for idx, book in enumerate(bl): + query = 'SELECT _id, thumbnail FROM books WHERE file_path = ?' 
+ t = (book.lpath,) + cursor.execute (query, t) + + for i, row in enumerate(cursor): + book.device_collections = bl_collections.get(row[0], None) + thumbnail = row[1] + if thumbnail is not None: + thumbnail = self.normalize_path(prefix + thumbnail) + book.thumbnail = ImageWrapper(thumbnail) + + cursor.close() + + return bl + + def set_plugboards(self, plugboards, pb_func): + self.plugboards = plugboards + self.plugboard_func = pb_func + + def sync_booklists(self, booklists, end_session=True): + debug_print('PRST1: starting sync_booklists') + + opts = self.settings() + if opts.extra_customization: + collections = [x.strip() for x in + opts.extra_customization[self.OPT_COLLECTIONS].split(',')] + else: + collections = [] + debug_print('PRST1: collection fields:', collections) + + if booklists[0] is not None: + self.update_device_database(booklists[0], collections, None) + if booklists[1] is not None: + self.update_device_database(booklists[1], collections, 'carda') + + USBMS.sync_booklists(self, booklists, end_session=end_session) + debug_print('PRST1: finished sync_booklists') + + def update_device_database(self, booklist, collections_attributes, oncard): + debug_print('PRST1: starting update_device_database') + + plugboard = None + if self.plugboard_func: + plugboard = self.plugboard_func(self.__class__.__name__, + 'device_db', self.plugboards) + debug_print("PRST1: Using Plugboard", plugboard) + + prefix = self._card_a_prefix if oncard == 'carda' else self._main_prefix + if prefix is None: + # Reader has no sd card inserted + return + source_id = 1 if oncard == 'carda' else 0 + + dbpath = self.normalize_path(prefix + DBPATH) + debug_print("SQLite DB Path: " + dbpath) + + collections = booklist.get_collections(collections_attributes) + + with closing(sqlite.connect(dbpath)) as connection: + self.update_device_books(connection, booklist, source_id, plugboard) + self.update_device_collections(connection, booklist, collections, source_id) + + debug_print('PRST1: finished update_device_database') + + def update_device_books(self, connection, booklist, source_id, plugboard): + opts = self.settings() + upload_covers = opts.extra_customization[self.OPT_UPLOAD_COVERS] + refresh_covers = opts.extra_customization[self.OPT_REFRESH_COVERS] + + cursor = connection.cursor() + + # Get existing books + query = 'SELECT file_path, _id FROM books' + cursor.execute(query) + + db_books = {} + for i, row in enumerate(cursor): + lpath = row[0].replace('\\', '/') + db_books[lpath] = row[1] + + for book in booklist: + # Run through plugboard if needed + if plugboard is not None: + newmi = book.deepcopy_metadata() + newmi.template_to_attribute(book, plugboard) + else: + newmi = book + + # Get Metadata We Want + lpath = book.lpath + author = newmi.authors[0] + title = newmi.title + + if lpath not in db_books: + query = ''' + INSERT INTO books + (title, author, source_id, added_date, modified_date, + file_path, file_name, file_size, mime_type, corrupted, + prevent_delete) + values (?,?,?,?,?,?,?,?,?,0,0) + ''' + t = (title, author, source_id, int(time.time() * 1000), + calendar.timegm(book.datetime), lpath, + os.path.basename(book.lpath), book.size, book.mime) + cursor.execute(query, t) + book.bookId = cursor.lastrowid + if upload_covers: + self.upload_book_cover(connection, book, source_id) + debug_print('Inserted New Book: ' + book.title) + else: + query = ''' + UPDATE books + SET title = ?, author = ?, modified_date = ?, file_size = ? + WHERE file_path = ? 
+ ''' + t = (title, author, calendar.timegm(book.datetime), book.size, + lpath) + cursor.execute(query, t) + book.bookId = db_books[lpath] + if refresh_covers: + self.upload_book_cover(connection, book, source_id) + db_books[lpath] = None + + for book, bookId in db_books.items(): + if bookId is not None: + # Remove From Collections + query = 'DELETE FROM collections WHERE content_id = ?' + t = (bookId,) + cursor.execute(query, t) + # Remove from Books + query = 'DELETE FROM books where _id = ?' + t = (bookId,) + cursor.execute(query, t) + debug_print('Deleted Book:' + book) + + connection.commit() + cursor.close() + + def update_device_collections(self, connection, booklist, collections, + source_id): + cursor = connection.cursor() + + if collections: + # Get existing collections + query = 'SELECT _id, title FROM collection' + cursor.execute(query) + + db_collections = {} + for i, row in enumerate(cursor): + db_collections[row[1]] = row[0] + + for collection, books in collections.items(): + if collection not in db_collections: + query = 'INSERT INTO collection (title, source_id) VALUES (?,?)' + t = (collection, source_id) + cursor.execute(query, t) + db_collections[collection] = cursor.lastrowid + debug_print('Inserted New Collection: ' + collection) + + # Get existing books in collection + query = ''' + SELECT books.file_path, content_id + FROM collections + LEFT OUTER JOIN books + WHERE collection_id = ? AND books._id = collections.content_id + ''' + t = (db_collections[collection],) + cursor.execute(query, t) + + db_books = {} + for i, row in enumerate(cursor): + db_books[row[0]] = row[1] + + for idx, book in enumerate(books): + if db_books.get(book.lpath, None) is None: + if collection not in book.device_collections: + book.device_collections.append(collection) + query = ''' + INSERT INTO collections (collection_id, content_id, + added_order) values (?,?,?) + ''' + t = (db_collections[collection], book.bookId, idx) + cursor.execute(query, t) + debug_print('Inserted Book Into Collection: ' + + book.title + ' -> ' + collection) + else: + query = ''' + UPDATE collections + SET added_order = ? + WHERE content_id = ? AND collection_id = ? + ''' + t = (idx, book.bookId, db_collections[collection]) + cursor.execute(query, t) + + db_books[book.lpath] = None + + for bookPath, bookId in db_books.items(): + if bookId is not None: + query = ('DELETE FROM collections ' + 'WHERE content_id = ? AND collection_id = ? 
') + t = (bookId, db_collections[collection],) + cursor.execute(query, t) + debug_print('Deleted Book From Collection: ' + bookPath + + ' -> ' + collection) + + db_collections[collection] = None + + for collection, collectionId in db_collections.items(): + if collectionId is not None: + # Remove Books from Collection + query = ('DELETE FROM collections ' + 'WHERE collection_id = ?') + t = (collectionId,) + cursor.execute(query, t) + # Remove Collection + query = ('DELETE FROM collection ' + 'WHERE _id = ?') + t = (collectionId,) + cursor.execute(query, t) + debug_print('Deleted Collection: ' + collection) + + + connection.commit() + cursor.close() + + def rebuild_collections(self, booklist, oncard): + debug_print('PRST1: starting rebuild_collections') + + opts = self.settings() + if opts.extra_customization: + collections = [x.strip() for x in + opts.extra_customization[self.OPT_COLLECTIONS].split(',')] + else: + collections = [] + debug_print('PRST1: collection fields:', collections) + + self.update_device_database(booklist, collections, oncard) + + debug_print('PRS-T1: finished rebuild_collections') + + def upload_book_cover(self, connection, book, source_id): + debug_print('PRST1: Uploading/Refreshing Cover for ' + book.title) + if not book.thumbnail and book.thumbnail[-1]: + return + cursor = connection.cursor() + + thumbnail_path = THUMBPATH%book.bookId + + prefix = self._main_prefix if source_id is 0 else self._card_a_prefix + thumbnail_file_path = os.path.join(prefix, *thumbnail_path.split('/')) + thumbnail_dir_path = os.path.dirname(thumbnail_file_path) + if not os.path.exists(thumbnail_dir_path): + os.makedirs(thumbnail_dir_path) + + with open(thumbnail_file_path, 'wb') as f: + f.write(book.thumbnail[-1]) + + query = 'UPDATE books SET thumbnail = ? WHERE _id = ?' + t = (thumbnail_path, book.bookId,) + cursor.execute(query, t) + + cursor.close() From 64f0f0fd9d02e55ccf4e8da3338e956bc0315ae6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 10 Oct 2011 15:02:14 +0530 Subject: [PATCH 33/58] ... --- src/calibre/library/cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/library/cli.py b/src/calibre/library/cli.py index 29deaa680b..65752eb183 100644 --- a/src/calibre/library/cli.py +++ b/src/calibre/library/cli.py @@ -47,6 +47,9 @@ def get_parser(usage): def get_db(dbpath, options): if options.library_path is not None: dbpath = options.library_path + if dbpath is None: + raise ValueError('No saved library path, either run the GUI or use the' + ' --with-library option') dbpath = os.path.abspath(dbpath) return LibraryDatabase2(dbpath) From b25724a8eaa713797f543b2c3321ba3c382970a8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 10 Oct 2011 16:34:57 +0530 Subject: [PATCH 34/58] Device drivers: Add a prepare_addable_books API method --- src/calibre/devices/interface.py | 6 ++++++ src/calibre/gui2/actions/add.py | 1 + 2 files changed, 7 insertions(+) diff --git a/src/calibre/devices/interface.py b/src/calibre/devices/interface.py index 56c950bd16..ad21632a50 100644 --- a/src/calibre/devices/interface.py +++ b/src/calibre/devices/interface.py @@ -518,3 +518,9 @@ class BookList(list): ''' raise NotImplementedError() + def prepare_addable_books(self, paths): + ''' + Given a list of paths, returns another list of paths. These paths + point to addable versions of the books. 
+ ''' + return paths diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index 08385f4f3f..dc709f221e 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -397,6 +397,7 @@ class AddAction(InterfaceAction): d = error_dialog(self.gui, _('Add to library'), _('No book files found')) d.exec_() return + paths = self.gui.device_manager.device.prepare_addable_books(paths) from calibre.gui2.add import Adder self.__adder_func = partial(self._add_from_device_adder, on_card=None, model=view.model()) From 236778f22f7b0d7ea87c9355d2cefe768697d7f1 Mon Sep 17 00:00:00 2001 From: Kolenka Date: Mon, 10 Oct 2011 09:18:43 -0700 Subject: [PATCH 35/58] Sony T1: Tweaks/Bugfixes based on feedback/discovery --- src/calibre/devices/prst1/driver.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 327334aaec..0a5ef4139f 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -265,7 +265,7 @@ class PRST1(USBMS): values (?,?,?,?,?,?,?,?,?,0,0) ''' t = (title, author, source_id, int(time.time() * 1000), - calendar.timegm(book.datetime), lpath, + int(calendar.timegm(book.datetime) * 1000), lpath, os.path.basename(book.lpath), book.size, book.mime) cursor.execute(query, t) book.bookId = cursor.lastrowid @@ -278,7 +278,7 @@ class PRST1(USBMS): SET title = ?, author = ?, modified_date = ?, file_size = ? WHERE file_path = ? ''' - t = (title, author, calendar.timegm(book.datetime), book.size, + t = (title, author, int(calendar.timegm(book.datetime) * 1000), book.size, lpath) cursor.execute(query, t) book.bookId = db_books[lpath] @@ -337,9 +337,9 @@ class PRST1(USBMS): db_books[row[0]] = row[1] for idx, book in enumerate(books): + if collection not in book.device_collections: + book.device_collections.append(collection) if db_books.get(book.lpath, None) is None: - if collection not in book.device_collections: - book.device_collections.append(collection) query = ''' INSERT INTO collections (collection_id, content_id, added_order) values (?,?,?) @@ -424,4 +424,5 @@ class PRST1(USBMS): t = (thumbnail_path, book.bookId,) cursor.execute(query, t) + connection.commit() cursor.close() From 4f0fc544bdcb646b7952b1ac84d83a6070b8e6c7 Mon Sep 17 00:00:00 2001 From: Kolenka Date: Mon, 10 Oct 2011 09:36:04 -0700 Subject: [PATCH 36/58] Sony T1: Ensure books that are resent have their cover refreshed. --- src/calibre/devices/prst1/driver.py | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py index 0a5ef4139f..ca8e2ae435 100644 --- a/src/calibre/devices/prst1/driver.py +++ b/src/calibre/devices/prst1/driver.py @@ -403,6 +403,38 @@ class PRST1(USBMS): debug_print('PRS-T1: finished rebuild_collections') + def upload_cover(self, path, filename, metadata, filepath): + debug_print('PRS-T1: uploading cover') + + if filepath.startswith(self._main_prefix): + prefix = self._main_prefix + source_id = 0 + else: + prefix = self._card_a_prefix + source_id = 1 + + metadata.lpath = filepath.partition(prefix)[2] + dbpath = self.normalize_path(prefix + DBPATH) + debug_print("SQLite DB Path: " + dbpath) + + with closing(sqlite.connect(dbpath)) as connection: + cursor = connection.cursor() + + query = 'SELECT _id FROM books WHERE file_path = ?' 
+ t = (metadata.lpath,) + cursor.execute(query, t) + + for i, row in enumerate(cursor): + metadata.bookId = row[0] + + cursor.close() + + if metadata.bookId is not None: + debug_print('PRS-T1: refreshing cover for book being sent') + self.upload_book_cover(connection, metadata, source_id) + + debug_print('PRS-T1: done uploading cover') + def upload_book_cover(self, connection, book, source_id): debug_print('PRST1: Uploading/Refreshing Cover for ' + book.title) if not book.thumbnail and book.thumbnail[-1]: From f5ac39d932fa491f930f9aac73ae6117ade0d73e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Oct 2011 04:12:02 +0530 Subject: [PATCH 37/58] Defense News by DM. Fixes #871916 (New recipe for DefenseNews) --- recipes/defensenews.recipe | 64 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 recipes/defensenews.recipe diff --git a/recipes/defensenews.recipe b/recipes/defensenews.recipe new file mode 100644 index 0000000000..8c0f9b0be7 --- /dev/null +++ b/recipes/defensenews.recipe @@ -0,0 +1,64 @@ +__license__ = 'GPL v3' +__copyright__ = '2011, Darko Miletic ' +''' +www.defensenews.com +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class DefenseNews(BasicNewsRecipe): + title = 'Defense News' + __author__ = 'Darko Miletic' + description = 'Find late-breaking defense news from the leading defense news weekly' + publisher = 'Gannett Government Media Corporation' + category = 'defense news, defence news, defense, defence, defence budget, defence policy' + oldest_article = 31 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://www.defensenews.com/images/logo_defensenews2.jpg' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + .info{font-size: small; color: gray} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [ + dict(name=['meta','link']) + ,dict(attrs={'class':['toolbar','related','left','right']}) + ] + remove_tags_before = attrs={'class':'storyWrp'} + remove_tags_after = attrs={'class':'middle'} + + remove_attributes=['lang'] + + feeds = [ + (u'Europe' , u'http://www.defensenews.com/rss/eur/' ) + ,(u'Americas', u'http://www.defensenews.com/rss/ame/' ) + ,(u'Asia & Pacific rim', u'http://www.defensenews.com/rss/asi/' ) + ,(u'Middle east & Africa', u'http://www.defensenews.com/rss/mid/') + ,(u'Air', u'http://www.defensenews.com/rss/air/' ) + ,(u'Land', u'http://www.defensenews.com/rss/lan/' ) + ,(u'Naval', u'http://www.defensenews.com/rss/sea/' ) + ] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup From 831d301d802315c2834c720d12fbeadfc3773b90 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 11 Oct 2011 04:18:38 +0530 Subject: [PATCH 38/58] ... 
---
 src/calibre/gui2/preferences/server.ui | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/preferences/server.ui b/src/calibre/gui2/preferences/server.ui
index b07a8cac34..be68c2448f 100644
--- a/src/calibre/gui2/preferences/server.ui
+++ b/src/calibre/gui2/preferences/server.ui
@@ -206,7 +206,7 @@
-       Run server &automatically on startup
+       Run server &automatically when calibre starts

From 983a3a76c5f6ae9b5add7544a8b18735bd517f79 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 04:30:57 +0530
Subject: [PATCH 39/58] Merco Press and Penguin news by Russell Phillips

---
 recipes/merco_press.recipe  | 27 +++++++++++++++++++++++++++
 recipes/penguin_news.recipe | 17 +++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 recipes/merco_press.recipe
 create mode 100644 recipes/penguin_news.recipe

diff --git a/recipes/merco_press.recipe b/recipes/merco_press.recipe
new file mode 100644
index 0000000000..efa2d6ec08
--- /dev/null
+++ b/recipes/merco_press.recipe
@@ -0,0 +1,27 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MercoPress(BasicNewsRecipe):
+    title = u'Merco Press'
+    description = u"Read News, Stories and Insight Analysis from Latin America and Mercosur. Politics, Economy, Business and Investments in South America."
+    cover_url = 'http://en.mercopress.com/web/img/en/mercopress-logo.gif'
+
+    __author__ = 'Russell Phillips'
+    language = 'en'
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+    auto_cleanup = True
+
+    extra_css = 'img{padding-bottom:1ex; display:block; text-align: center;}'
+    remove_tags = [dict(name='a')]
+
+    feeds = [('Antarctica', 'http://en.mercopress.com/rss/antarctica'),
+        ('Argentina', 'http://en.mercopress.com/rss/argentina'),
+        ('Brazil', 'http://en.mercopress.com/rss/brazil'),
+        ('Falkland Islands', 'http://en.mercopress.com/rss/falkland-islands'),
+        ('International News', 'http://en.mercopress.com/rss/international'),
+        ('Latin America', 'http://en.mercopress.com/rss/latin-america'),
+        ('Mercosur', 'http://en.mercopress.com/rss/mercosur'),
+        ('Paraguay', 'http://en.mercopress.com/rss/paraguay'),
+        ('United States', 'http://en.mercopress.com/rss/united-states'),
+        ('Uruguay', 'http://en.mercopress.com/rss/uruguay')]
diff --git a/recipes/penguin_news.recipe b/recipes/penguin_news.recipe
new file mode 100644
index 0000000000..6761623a55
--- /dev/null
+++ b/recipes/penguin_news.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MercoPress(BasicNewsRecipe):
+    title = u'Penguin News'
+    description = u"Penguin News: the Falkland Islands' only newspaper."
+    cover_url = 'http://www.penguin-news.com/templates/rt_syndicate_j15/images/logo/light/logo1.png'
+    language = 'en'
+
+    __author__ = 'Russell Phillips'
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+    auto_cleanup = True
+
+    extra_css = 'img{padding-bottom:1ex; display:block; text-align: center;}'
+
+    feeds = [(u'Penguin News - Falkland Islands', u'http://www.penguin-news.com/index.php?format=feed&type=rss')]

From a099448d6ac6398048db364cc176c6f3dbc3c8a9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 07:28:56 +0530
Subject: [PATCH 40/58] Changes to the build process to accommodate my current circumstances

---
 setup/build_environment.py  |  5 ++++-
 setup/installer/__init__.py | 13 ++++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/setup/build_environment.py b/setup/build_environment.py
index d4a7af833b..eb34763fb4 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -225,7 +225,10 @@ except:
     try:
         HOST=get_ip_address('wlan0')
     except:
-        HOST='192.168.1.2'
+        try:
+            HOST=get_ip_address('ppp0')
+        except:
+            HOST='192.168.1.2'
 
 PROJECT=os.path.basename(os.path.abspath('.'))
 
diff --git a/setup/installer/__init__.py b/setup/installer/__init__.py
index 79bb942cde..8374f93e38 100644
--- a/setup/installer/__init__.py
+++ b/setup/installer/__init__.py
@@ -20,17 +20,23 @@ for x in [
     EXCLUDES.extend(['--exclude', x])
 SAFE_EXCLUDES = ['"%s"'%x if '*' in x else x for x in EXCLUDES]
 
+def get_rsync_pw():
+    return open('/home/kovid/work/kde/conf/buildbot').read().partition(
+        ':')[-1].strip()
+
 class Rsync(Command):
 
     description = 'Sync source tree from development machine'
 
     SYNC_CMD = ' '.join(BASE_RSYNC+SAFE_EXCLUDES+
-        ['rsync://{host}/work/{project}', '..'])
+        ['rsync://buildbot@{host}/work/{project}', '..'])
 
     def run(self, opts):
         cmd = self.SYNC_CMD.format(host=HOST, project=PROJECT)
+        env = dict(os.environ)
+        env['RSYNC_PASSWORD'] = get_rsync_pw()
         self.info(cmd)
-        subprocess.check_call(cmd, shell=True)
+        subprocess.check_call(cmd, shell=True, env=env)
 
 class Push(Command):
 
@@ -81,7 +87,8 @@ class VMInstaller(Command):
 
     def get_build_script(self):
-        ans = '\n'.join(self.BUILD_PREFIX)+'\n\n'
+        rs = ['export RSYNC_PASSWORD=%s'%get_rsync_pw()]
+        ans = '\n'.join(self.BUILD_PREFIX + rs)+'\n\n'
         ans += ' && \\\n'.join(self.BUILD_RSYNC) + ' && \\\n'
         ans += ' && \\\n'.join(self.BUILD_CLEAN) + ' && \\\n'
         ans += ' && \\\n'.join(self.BUILD_BUILD) + ' && \\\n'

From 96fdd1799f39c83786289593ac194791f6779173 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 09:07:38 +0530
Subject: [PATCH 41/58] WoW Insider by Krittika Goyal

---
 recipes/wow.recipe | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 recipes/wow.recipe

diff --git a/recipes/wow.recipe b/recipes/wow.recipe
new file mode 100644
index 0000000000..9024f8eaf4
--- /dev/null
+++ b/recipes/wow.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WoW(BasicNewsRecipe):
+    title = u'WoW Insider'
+    language = 'en'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
+
+    feeds = [
+('WoW',
+ 'http://wow.joystiq.com/rss.xml')
+]

From f21132e16d3d7f5ea570611ef3bf776f03d6fec1 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Tue, 11 Oct 2011 09:39:38 +0200
Subject: [PATCH 42/58] Fix for the problem where setting the restriction to an empty current search clears the restriction box but does not clear the restriction.
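Resetting the combo box to index 0 only updates the widget; the
restriction itself was never cleared, so the old one stayed in force.
A minimal sketch of the fixed flow (names as in SearchRestrictionMixin,
not the complete method):

    def apply_named_search_restriction(self, search):
        search = unicode(search)
        if not search:
            self.search_restriction.setCurrentIndex(0)
            # the UI reset alone is not enough; clear the actual
            # restriction as well
            self._apply_search_restriction('')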
From 96fdd1799f39c83786289593ac194791f6779173 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 09:07:38 +0530
Subject: [PATCH 41/58] WoW Insider by Krittika Goyal

---
 recipes/wow.recipe | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 recipes/wow.recipe

diff --git a/recipes/wow.recipe b/recipes/wow.recipe
new file mode 100644
index 0000000000..9024f8eaf4
--- /dev/null
+++ b/recipes/wow.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class WoW(BasicNewsRecipe):
+    title = u'WoW Insider'
+    language = 'en'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1  # days
+    max_articles_per_feed = 25
+    use_embedded_content = False
+
+    no_stylesheets = True
+    auto_cleanup = True
+
+    feeds = [
+('WoW',
+ 'http://wow.joystiq.com/rss.xml')
+]

From f21132e16d3d7f5ea570611ef3bf776f03d6fec1 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Tue, 11 Oct 2011 09:39:38 +0200
Subject: [PATCH 42/58] Fix for the problem where setting the restriction to
 an empty current search clears the restriction box but does not clear the
 restriction.

---
 src/calibre/gui2/search_restriction_mixin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py
index ffebc9e131..1319f8d17d 100644
--- a/src/calibre/gui2/search_restriction_mixin.py
+++ b/src/calibre/gui2/search_restriction_mixin.py
@@ -37,6 +37,7 @@ class SearchRestrictionMixin(object):
         search = unicode(search)
         if not search:
             self.search_restriction.setCurrentIndex(0)
+            self._apply_search_restriction('')
         else:
             s = '*' + search
             if self.search_restriction.count() > 1:

From 430f67c3f5a1156afa936ce12ea7f4dd1fd696dd Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 17:26:51 +0530
Subject: [PATCH 43/58] ...

---
 src/calibre/devices/prst1/driver.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index ca8e2ae435..22ef567280 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -253,8 +253,11 @@ class PRST1(USBMS):
 
             # Get Metadata We Want
             lpath = book.lpath
-            author = newmi.authors[0]
-            title = newmi.title
+            try:
+                author = newmi.authors[0]
+            except:
+                author = _('Unknown')
+            title = newmi.title or _('Unknown')
 
             if lpath not in db_books:
                 query = '''
@@ -405,34 +408,34 @@ class PRST1(USBMS):
 
     def upload_cover(self, path, filename, metadata, filepath):
         debug_print('PRS-T1: uploading cover')
-        
+
         if filepath.startswith(self._main_prefix):
             prefix = self._main_prefix
             source_id = 0
         else:
             prefix = self._card_a_prefix
             source_id = 1
-        
+
         metadata.lpath = filepath.partition(prefix)[2]
         dbpath = self.normalize_path(prefix + DBPATH)
         debug_print("SQLite DB Path: " + dbpath)
 
-        with closing(sqlite.connect(dbpath)) as connection: 
+        with closing(sqlite.connect(dbpath)) as connection:
             cursor = connection.cursor()
-            
+
             query = 'SELECT _id FROM books WHERE file_path = ?'
             t = (metadata.lpath,)
             cursor.execute(query, t)
-            
+
             for i, row in enumerate(cursor):
                 metadata.bookId = row[0]
-            
+
             cursor.close()
-            
+
             if metadata.bookId is not None:
                 debug_print('PRS-T1: refreshing cover for book being sent')
                 self.upload_book_cover(connection, metadata, source_id)
-        
+
         debug_print('PRS-T1: done uploading cover')
 
     def upload_book_cover(self, connection, book, source_id):
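Patch 43 above guards the two metadata reads that can fail for badly tagged books: newmi.authors[0] raises when authors is empty or None, and the `or _('Unknown')` also covers a title that is an empty string rather than None. The same fallback as a standalone sketch (plain strings stand in for calibre's _() translation helper; mi stands in for a calibre Metadata object):

    def safe_author_title(mi):
        # Illustrative only; mirrors the driver's defensive reads.
        try:
            author = mi.authors[0]      # IndexError if [], TypeError if None
        except Exception:
            author = 'Unknown'
        title = mi.title or 'Unknown'   # catches both None and ''
        return author, title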
From 967ec83a7cbfa8991f9a060e07b7e3165cc48cc6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 17:44:25 +0530
Subject: [PATCH 44/58] ...

---
 recipes/guardian.recipe | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 05d6616ace..f063934b3d 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -119,10 +119,8 @@ class Guardian(BasicNewsRecipe):
         }
 
     def parse_index(self):
-        try:
-            feeds = []
-            for title, href in self.find_sections():
-                feeds.append((title, list(self.find_articles(href))))
-            return feeds
-        except:
-            raise NotImplementedError
+        feeds = []
+        for title, href in self.find_sections():
+            feeds.append((title, list(self.find_articles(href))))
+        return feeds
+

From bffa00773d14f5cffec23f685a5fdd1642614201 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 11 Oct 2011 18:52:41 +0530
Subject: [PATCH 45/58] T1 driver: add detection of SD card on windows

---
 src/calibre/devices/prst1/driver.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 22ef567280..44c93af4cc 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -47,6 +47,9 @@ class PRST1(USBMS):
 
     WINDOWS_MAIN_MEM = re.compile(
             r'(PRS-T1&)'
             )
+    WINDOWS_CARD_A_MEM = re.compile(
+            r'(PRS-T1__SD&)'
+            )
 
     MAIN_MEMORY_VOLUME_LABEL = 'SONY Reader Main Memory'
     STORAGE_CARD_VOLUME_LABEL = 'SONY Reader Storage Card'

From 79f9adee92cad6df84e5cee10cc91c7c5547af44 Mon Sep 17 00:00:00 2001
From: Kolenka
Date: Tue, 11 Oct 2011 10:18:09 -0700
Subject: [PATCH 46/58] Sony T1: Fix typo in if statement when uploading a
 cover

---
 src/calibre/devices/prst1/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 44c93af4cc..eeb73da182 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -443,7 +443,7 @@ class PRST1(USBMS):
 
     def upload_book_cover(self, connection, book, source_id):
         debug_print('PRST1: Uploading/Refreshing Cover for ' + book.title)
-        if not book.thumbnail and book.thumbnail[-1]:
+        if not book.thumbnail or not book.thumbnail[-1]:
             return
 
         cursor = connection.cursor()

From 7519a43d8f6ada651f3633aaea20eb122c3b39f6 Mon Sep 17 00:00:00 2001
From: Matthias Maennich
Date: Tue, 11 Oct 2011 22:16:53 +0200
Subject: [PATCH 47/58] smtp: add Date header as required by RFC 5322

This prevents bad header classifications in mail filters.

---
 src/calibre/utils/smtp.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/utils/smtp.py b/src/calibre/utils/smtp.py
index 2887f985a4..1702aaea5c 100644
--- a/src/calibre/utils/smtp.py
+++ b/src/calibre/utils/smtp.py
@@ -18,11 +18,13 @@ def create_mail(from_, to, subject, text=None, attachment_data=None,
     assert text or attachment_data
 
     from email.mime.multipart import MIMEMultipart
+    from email.utils import formatdate
 
     outer = MIMEMultipart()
     outer['Subject'] = subject
     outer['To'] = to
     outer['From'] = from_
+    outer['Date'] = formatdate()
     outer.preamble = 'You will not see this in a MIME-aware mail reader.\n'
 
     if text is not None:
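RFC 5322 lists the origination date (Date:) field among the required headers, and the standard library's email.utils.formatdate produces it in exactly the right format, which is what the smtp patch above uses. A minimal sketch of the header being added (the message content is illustrative):

    from email.mime.text import MIMEText
    from email.utils import formatdate

    msg = MIMEText('hello')
    msg['Subject'] = 'test'
    # formatdate() emits an RFC 5322 date such as 'Wed, 12 Oct 2011 23:14:05 -0000';
    # localtime=True would keep the local UTC offset instead of -0000.
    msg['Date'] = formatdate()
    print(msg.as_string())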
From 9ccae653feef7c610c70814b3dea017de54c1cd3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Oct 2011 04:44:05 +0530
Subject: [PATCH 48/58] Fix #872447 (Updated geek and poke recipe)

---
 recipes/geek_poke.recipe | 58 ++++++++++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/recipes/geek_poke.recipe b/recipes/geek_poke.recipe
index 8fa9e7ca29..7f5117b586 100644
--- a/recipes/geek_poke.recipe
+++ b/recipes/geek_poke.recipe
@@ -1,35 +1,71 @@
-#!/usr/bin/python
-
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.utils.magick import Image
 
 class AdvancedUserRecipe1307556816(BasicNewsRecipe):
     title = u'Geek and Poke'
     __author__ = u'DrMerry'
     description = u'Geek and Poke Cartoons'
+    publisher = u'Oliver Widder'
+    author = u'Oliver Widder, DrMerry (calibre-code), calibre'
     oldest_article = 31
     max_articles_per_feed = 100
     language = u'en'
     simultaneous_downloads = 5
     #delay = 1
-    timefmt = ' [%A, %d %B, %Y]'
+    timefmt = ' [%a, %d %B, %Y]'
     summary_length = -1
     no_stylesheets = True
+    category = 'News.IT, Cartoon, Humor, Geek'
+    use_embedded_content = False
     cover_url = 'http://geekandpoke.typepad.com/aboutcoders.jpeg'
     remove_javascript = True
     remove_empty_feeds = True
     publication_type = 'blog'
+    conversion_options = {
+         'comments'  : ''
+        ,'tags'      : category
+        ,'language'  : language
+        ,'publisher' : publisher
+        ,'author'    : author
+    }
 
-    preprocess_regexps = [ (re.compile(r'(<p>&nbsp;</p>|<iframe.*</iframe>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),lambda match: ''),
-                           (re.compile(r'(&nbsp;| )', re.DOTALL|re.IGNORECASE),lambda match: ' '),
-                           (re.compile(r'(<br>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>')
-                          ]
+    remove_tags_before = dict(name='p', attrs={'class':'content-nav'})
+    remove_tags_after = dict(name='div', attrs={'class':'entry-content'})
+    remove_tags = [dict(name='div', attrs={'class':'entry-footer'}),
+                   dict(name='div', attrs={'id':'alpha'}),
+                   dict(name='div', attrs={'id':'gamma'}),
+                   dict(name='iframe'),
+                   dict(name='p', attrs={'class':'content-nav'})]
 
-    extra_css = 'body, h3, p, h2, h1, div, span{margin:0px} h2.date-header {font-size: 0.7em; color:#eee;} h3.entry-header{font-size: 1.0em} div.entry-body{font-size: 0.9em}'
+    filter_regexps = [(r'feedburner\.com'),
+                      (r'pixel.quantserve\.com'),
+                      (r'googlesyndication\.com'),
+                      (r'yimg\.com'),
+                      (r'scorecardresearch\.com')]
 
-    remove_tags_before = dict(name='h2', attrs={'class':'date-header'})
-    remove_tags_after = dict(name='div', attrs={'class':'entry-body'})
+    preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>)', re.DOTALL|re.IGNORECASE),lambda match: ''),
+        (re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
+        (re.compile(r'<h2[^>]*>([^<]*)</h2>[^>]*(<a[^>]*>)', re.DOTALL|re.IGNORECASE), lambda match: match.group(2) + '<h2>' + match.group(1) + '</h2>'),
+        (re.compile(r'(<a[^>]*>)<h3[^>]>((?!</a).*)</a>', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + match.group(2) + '</a>'),
+        (re.compile(r'(<img[^>]*alt="([^"]*)"[^>]*>)', re.DOTALL|re.IGNORECASE),lambda match: match.group(1) + '<br>' + match.group(2) + '</a>'),
+        (re.compile(r'(<br>)+', re.DOTALL|re.IGNORECASE),lambda match: '<br>'),
+        (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')
+    ]
 
+    extra_css = 'body, h3, p, #MERRYdate, h1, div, span{margin:0px; padding:0px} h3.entry-header{font-size: 0.8em} div.entry-body{font-size: 0.7em} #MERRYdate {font-size: 0.5em}'
 
+    def postprocess_html(self, soup, first):
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            width, height = img.size
+            #print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            img.trim(0)
+            img.save(iurl)
+            width, height = img.size
+            #print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+        return soup
 
-    feeds = [(u'Geek and Poke', u'http://feeds.feedburner.com/GeekAndPoke?format=xml')]
+    feeds = ['http://feeds.feedburner.com/GeekAndPoke?format=xml']

From a5be334092487c74b866d02f99dd7188a03f15ea Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Oct 2011 04:45:20 +0530
Subject: [PATCH 49/58] Fix #872385 (Sony T1: Typo in upload_cover)

---
 src/calibre/devices/prst1/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 44c93af4cc..eeb73da182 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -443,7 +443,7 @@ class PRST1(USBMS):
 
     def upload_book_cover(self, connection, book, source_id):
         debug_print('PRST1: Uploading/Refreshing Cover for ' + book.title)
-        if not book.thumbnail and book.thumbnail[-1]:
+        if not book.thumbnail or not book.thumbnail[-1]:
             return
 
         cursor = connection.cursor()
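The one-character fix applied in patches 46 and 49 above is worth spelling out: with `and`, the guard evaluated book.thumbnail[-1] precisely when book.thumbnail was falsy, so it raised TypeError for a missing thumbnail and never triggered the early return otherwise. A small demonstration (thumb stands in for book.thumbnail, whose last element holds the cover image data):

    def should_skip(thumb):
        # The corrected guard from the patch.
        return not thumb or not thumb[-1]

    print(should_skip(None))               # True  -> no thumbnail at all, skip
    print(should_skip((60, 80, b'')))      # True  -> empty image data, skip
    print(should_skip((60, 80, b'JPEG')))  # False -> a cover exists, upload it

    # The old guard, `not thumb and thumb[-1]`, raises TypeError for None and
    # returns False for every tuple, so the early return could never fire.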
From af3d7c4a9ccf00305683cf9bc28874d08455b673 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Oct 2011 06:01:16 +0530
Subject: [PATCH 50/58] ...

---
 src/calibre/devices/prst1/driver.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index eeb73da182..02913b0bcc 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+        print_function)
 
 __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'

From f2248830dfed48376c9e83eba90778852407b063 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Oct 2011 06:01:51 +0530
Subject: [PATCH 51/58] Fix #871388 (Conversion: add support for <font face>)

---
 src/calibre/ebooks/oeb/transforms/flatcss.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py
index 474bb0eb32..6f338cb6d1 100644
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -246,6 +246,7 @@ class CSSFlattener(object):
                 cssdict['font-size'] = '%.1fpt'%font_size
                 del node.attrib['size']
             if 'face' in node.attrib:
+                cssdict['font-family'] = node.attrib['face']
                 del node.attrib['face']
             if 'color' in node.attrib:
                 cssdict['color'] = node.attrib['color']
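The one-line flatcss change above completes the mapping of legacy <font> attributes to CSS: size was already handled, and face now becomes font-family. A standalone sketch of just that attribute mapping (using lxml directly; the real transform also handles size keywords and style inheritance):

    from lxml import html

    node = html.fragment_fromstring('<font face="Georgia" color="#333333">Hi</font>')
    cssdict = {}
    if 'face' in node.attrib:
        cssdict['font-family'] = node.attrib['face']   # the newly added mapping
        del node.attrib['face']
    if 'color' in node.attrib:
        cssdict['color'] = node.attrib['color']        # already present before the patch
        del node.attrib['color']
    print(cssdict)  # {'font-family': 'Georgia', 'color': '#333333'}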
From fd2df7f5a44b350cc172fc61f106489c654c452d Mon Sep 17 00:00:00 2001
From: Kolenka
Date: Tue, 11 Oct 2011 19:33:40 -0700
Subject: [PATCH 52/58] Sony T1: Bugfixes + Author Sort

Ensure path is correct on Windows before updating database
Ensure modification time is being used on Windows
Enable "Use Author Sort" for the T1

---
 src/calibre/devices/prst1/driver.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index eeb73da182..8e46541996 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -55,6 +55,7 @@ class PRST1(USBMS):
 
     THUMBNAIL_HEIGHT = 144
     SUPPORTS_SUB_DIRS = True
+    SUPPORTS_USE_AUTHOR_SORT = True
     MUST_READ_METADATA = True
 
     EBOOK_DIR_MAIN = 'Sony_Reader/media/books'
@@ -255,12 +256,20 @@ class PRST1(USBMS):
                 newmi = book
 
             # Get Metadata We Want
-            lpath = book.lpath
+            # Make sure lpath uses Unix-style strings
+            lpath = book.lpath.replace('\\', '/')
             try:
-                author = newmi.authors[0]
+                if opts.use_author_sort:
+                    if newmi.author_sort :
+                        author = newmi.author_sort
+                    else:
+                        author = authors_to_sort_string(newmi.authors)
+                else:
+                    author = newmi.authors[0]
             except:
                 author = _('Unknown')
             title = newmi.title or _('Unknown')
+            modified_date = os.path.getmtime(book.path) * 1000
 
             if lpath not in db_books:
                 query = '''
@@ -271,8 +280,8 @@ class PRST1(USBMS):
                 values (?,?,?,?,?,?,?,?,?,0,0)
                 '''
                 t = (title, author, source_id, int(time.time() * 1000),
-                        int(calendar.timegm(book.datetime) * 1000), lpath,
-                        os.path.basename(book.lpath), book.size, book.mime)
+                        modified_date, lpath,
+                        os.path.basename(lpath), book.size, book.mime)
                 cursor.execute(query, t)
                 book.bookId = cursor.lastrowid
                 if upload_covers:
@@ -284,7 +293,7 @@ class PRST1(USBMS):
                 SET title = ?, author = ?, modified_date = ?, file_size = ?
                 WHERE file_path = ?
                 '''
-                t = (title, author, int(calendar.timegm(book.datetime) * 1000), book.size,
+                t = (title, author, modified_date, book.size,
                         lpath)
                 cursor.execute(query, t)
                 book.bookId = db_books[lpath]

From 342d189f87febd19c9ad2b6b6196d4a68d0776c5 Mon Sep 17 00:00:00 2001
From: Kolenka
Date: Tue, 11 Oct 2011 19:37:06 -0700
Subject: [PATCH 53/58] Sony T1: Cleanup

---
 src/calibre/devices/prst1/driver.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 8e46541996..5376eeb87f 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -293,8 +293,7 @@ class PRST1(USBMS):
                 SET title = ?, author = ?, modified_date = ?, file_size = ?
                 WHERE file_path = ?
                 '''
-                t = (title, author, modified_date, book.size,
-                        lpath)
+                t = (title, author, modified_date, book.size, lpath)
                 cursor.execute(query, t)
                 book.bookId = db_books[lpath]
                 if refresh_covers:

From b6a7aa34f344cfa4cbd1dcaa96b1b96c731b0368 Mon Sep 17 00:00:00 2001
From: Kolenka
Date: Tue, 11 Oct 2011 19:38:01 -0700
Subject: [PATCH 54/58] Sony T1: Cleanup

---
 src/calibre/devices/prst1/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 5376eeb87f..592dbd8552 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -256,7 +256,7 @@ class PRST1(USBMS):
                 newmi = book
 
             # Get Metadata We Want
-            # Make sure lpath uses Unix-style strings 
+            # Make sure lpath uses Unix-style strings
             lpath = book.lpath.replace('\\', '/')
             try:
                 if opts.use_author_sort:

From 9b56902017d0a334be6148f87f42fee51e1d9d6b Mon Sep 17 00:00:00 2001
From: Kolenka
Date: Tue, 11 Oct 2011 19:59:44 -0700
Subject: [PATCH 55/58] Sony T1: Use localtime timestamp for modified_date

---
 src/calibre/devices/prst1/driver.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 592dbd8552..578a9bc65c 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -269,7 +269,11 @@ class PRST1(USBMS):
             except:
                 author = _('Unknown')
             title = newmi.title or _('Unknown')
-            modified_date = os.path.getmtime(book.path) * 1000
+
+            # Get modified date
+            modified_date = os.path.getmtime(book.path)
+            time_offset = time.altzone if time.daylight else time.timezone
+            modified_date = (modified_date - time_offset) * 1000
 
             if lpath not in db_books:
                 query = '''

From 075ca4e603c9aa2b2b64b9f38fb3d40320e34d57 Mon Sep 17 00:00:00 2001
From: Kolenka
Date: Tue, 11 Oct 2011 21:09:17 -0700
Subject: [PATCH 56/58] Sony T1: Fix behavior which caused Windows path seps
 to leak into Calibre

---
 src/calibre/devices/prst1/driver.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index 578a9bc65c..3d71bae323 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -256,8 +256,7 @@ class PRST1(USBMS):
                 newmi = book
 
             # Get Metadata We Want
-            # Make sure lpath uses Unix-style strings
-            lpath = book.lpath.replace('\\', '/')
+            lpath = book.lpath
             try:
                 if opts.use_author_sort:
                     if newmi.author_sort :
@@ -432,6 +431,7 @@ class PRST1(USBMS):
             source_id = 1
 
         metadata.lpath = filepath.partition(prefix)[2]
+        metadata.lpath = metadata.lpath.replace('\\', '/')
         dbpath = self.normalize_path(prefix + DBPATH)
         debug_print("SQLite DB Path: " + dbpath)
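The modified_date arithmetic introduced in patch 55 above deserves a worked example: os.path.getmtime returns UTC-based epoch seconds, the Reader's database evidently expects local wall-clock milliseconds, and time.timezone/time.altzone give the active UTC offset in seconds west of UTC (negative east of Greenwich). The same computation as a standalone sketch (the file path is illustrative):

    import os
    import time

    def device_mtime_ms(path):
        mtime = os.path.getmtime(path)                             # UTC-based epoch seconds
        offset = time.altzone if time.daylight else time.timezone  # seconds west of UTC
        return int((mtime - offset) * 1000)

    # For IST (UTC+5:30), time.timezone == -19800, so an mtime of 1318372200.0
    # becomes (1318372200 + 19800) * 1000 = 1318392000000.
    print(device_mtime_ms(__file__))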
From 45de2bde56867eba0fb127b0ff5fe665cd49962f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Oct 2011 11:33:01 +0530
Subject: [PATCH 57/58] ...

---
 src/calibre/devices/prst1/driver.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/calibre/devices/prst1/driver.py b/src/calibre/devices/prst1/driver.py
index a3e3e8d0a0..eb64a3c15a 100644
--- a/src/calibre/devices/prst1/driver.py
+++ b/src/calibre/devices/prst1/driver.py
@@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
 Device driver for the SONY T1 devices
 '''
 
-import os, time, calendar, re
+import os, time, re
 import sqlite3 as sqlite
 from contextlib import closing
 
@@ -19,6 +19,7 @@ from calibre.devices.usbms.driver import USBMS, debug_print
 from calibre.devices.usbms.device import USBDevice
 from calibre.devices.usbms.books import CollectionsBookList
 from calibre.devices.usbms.books import BookList
+from calibre.ebooks.metadata import authors_to_sort_string
 from calibre.constants import islinux
 
 DBPATH = 'Sony_Reader/database/books.db'
@@ -261,7 +262,7 @@ class PRST1(USBMS):
             lpath = book.lpath
             try:
                 if opts.use_author_sort:
-                    if newmi.author_sort :
+                    if newmi.author_sort:
                         author = newmi.author_sort
                     else:
                         author = authors_to_sort_string(newmi.authors)
@@ -449,7 +450,7 @@ class PRST1(USBMS):
 
         cursor.close()
 
-        if metadata.bookId is not None:
+        if getattr(metadata, 'bookId', None) is not None:
             debug_print('PRS-T1: refreshing cover for book being sent')
             self.upload_book_cover(connection, metadata, source_id)

From cadbff1290439a5c4994ea47a0a8aa0bdd58dae0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Oct 2011 20:31:43 +0530
Subject: [PATCH 58/58] Fix #872875 (dilbert update)

---
 recipes/dilbert.recipe | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/recipes/dilbert.recipe b/recipes/dilbert.recipe
index 56aa4af8c9..ed2771debf 100644
--- a/recipes/dilbert.recipe
+++ b/recipes/dilbert.recipe
@@ -2,6 +2,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
 http://www.dilbert.com
+DrMerry added cover Image 2011-11-12
 '''
 
 from calibre.web.feeds.recipes import BasicNewsRecipe
@@ -9,7 +10,7 @@ import re
 
 class DilbertBig(BasicNewsRecipe):
     title                 = 'Dilbert'
-    __author__            = 'Darko Miletic and Starson17'
+    __author__            = 'Darko Miletic and Starson17, with a contribution by DrMerry'
     description           = 'Dilbert'
     reverse_article_order = True
     oldest_article        = 15
@@ -20,6 +21,7 @@ class DilbertBig(BasicNewsRecipe):
     publisher             = 'UNITED FEATURE SYNDICATE, INC.'
     category              = 'comic'
     language              = 'en'
+    cover_url             = 'http://dilbert.com/mobile/mobile/dilbert.app.icon.png'
 
     conversion_options = {
                              'comments' : description