Sync to trunk.

2025-07-09 03:04:10 -04:00 · 2010-01-10 08:41:29 -05:00 · 2010-01-10 08:41:29 -05:00 · 5f5fd2a2a8
commit 5f5fd2a2a8
parent 77d4af5d31 8caf640ca9
49 changed files with 23573 additions and 12607 deletions
--- a/resources/recipes/dallas.recipe
+++ b/resources/recipes/dallas.recipe
@ -0,0 +1,28 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class DallasNews(BasicNewsRecipe):
    '''Recipe for The Dallas Morning News, built from the site's RSS feeds.'''

    title          = u'The Dallas Morning News'
    language       = 'en'
    oldest_article = 2 #days
    max_articles_per_feed = 25

    no_stylesheets = True
    remove_tags_before = dict(name='h2', attrs={'class':'vitstoryheadline'})
    # NOTE: this attribute used to be assigned twice in a row; the first value
    # (the 'width: 100%; clear: right' div) was overwritten immediately and
    # never took effect, so only the effective assignment is kept. That div is
    # still listed in remove_tags below.
    remove_tags_after  = dict(name='div', attrs={'id':'article_tools_bottom'})
    remove_tags = [
       dict(name='iframe'),
       dict(name='div', attrs={'class':'biblockmore'}),
       dict(name='div', attrs={'style':'width: 100%; clear: right'}),
       dict(name='div', attrs={'id':'article_tools_bottom'}),
       #dict(name='ul', attrs={'class':'articleTools'}),
    ]

    # (feed title, feed URL) pairs
    feeds          = [
                      ('Latest News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslatestnews.xml'),
                      ('Local News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslocalnews.xml'),
                      ('Nation and World', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml'),
                      ('Politics', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml'),
                      ('Science', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsscience.xml'),
                    ]
--- a/resources/recipes/economist.recipe
+++ b/resources/recipes/economist.recipe
@ -7,6 +7,7 @@ economist.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
 import mechanize, string, urllib, time
@ -103,3 +104,22 @@ class Economist(BasicNewsRecipe):
        if not ans:
            raise Exception('Could not find any articles. Has your subscription expired?')
        return ans
    def eco_find_image_tables(self, soup):
        '''
        Yield each right-aligned <table> in ``soup`` that contains exactly one
        <img> and either one or two <font> tags.
        '''
        for candidate in soup.findAll('table', align='right'):
            font_count = len(candidate.findAll('font'))
            if font_count not in (1, 2):
                continue
            if len(candidate.findAll('img')) != 1:
                continue
            yield candidate
    def postprocess_html(self, soup, first):
        '''
        Replace every image table found by eco_find_image_tables() with a
        centred <div> holding the caption text, a <br> and the image, in that
        order. Returns the modified ``soup``.
        '''
        # Materialise the matches first: the tree is modified while we loop.
        image_tables = list(self.eco_find_image_tables(soup))
        for image_table in image_tables:
            caption_tag = image_table.find('font')
            picture = image_table.find('img')
            wrapper = Tag(soup, 'div')
            wrapper['style'] = 'text-align:center;font-size:70%'
            caption_text = NavigableString(self.tag_to_string(caption_tag))
            wrapper.insert(0, caption_text)
            wrapper.insert(1, Tag(soup, 'br'))
            # Detach the image from the table before re-parenting it
            picture.extract()
            wrapper.insert(2, picture)
            image_table.replaceWith(wrapper)
        return soup
--- a/resources/recipes/economist_free.recipe
+++ b/resources/recipes/economist_free.recipe
@ -1,6 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.threadpool import ThreadPool, makeRequests
-import time
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
 import time, string
 from datetime import datetime
 from lxml import html
@ -48,7 +49,30 @@ class Economist(BasicNewsRecipe):
        for r in requests: pool.putRequest(r)
        pool.wait()
-        return [(t, a) for t, a in self.feed_dict.items()]
+        return self.eco_sort_sections([(t, a) for t, a in
            self.feed_dict.items()])
    def eco_sort_sections(self, feeds):
        '''
        Return ``feeds`` ordered by the fixed section ranking below.

        :param feeds: iterable of tuples whose first item is the section title
        :return: a new list; titles missing from the ranking get rank 100 and
                 therefore sort after all ranked sections, keeping their
                 relative input order (the sort is stable)
        '''
        order = {
            'The World This Week': 1,
            'Leaders': 2,
            'Letters': 3,
            'Briefing': 4,
            'Business': 5,
            'Finance And Economics': 6,
            'Science & Technology': 7,
            'Books & Arts': 8,
            'International': 9,
            'United States': 10,
            'Asia': 11,
            'Europe': 12,
            'The Americas': 13,
            'Middle East & Africa': 14,
            'Britain': 15,
            'Obituary': 16,
        }
        # A key function yields the same stable ordering as the former
        # cmp-based sort, computes each rank only once and does not depend on
        # the cmp builtin.
        return sorted(feeds, key=lambda feed: order.get(feed[0], 100))
    def process_eco_feed_article(self, args):
        from calibre import browser
@ -61,8 +85,8 @@ class Economist(BasicNewsRecipe):
        matches = root.xpath('//*[@class = "article-section"]')
        feedtitle = 'Miscellaneous'
        if matches:
-            feedtitle = html.tostring(matches[0], method='text',
+            feedtitle = string.capwords(html.tostring(matches[0], method='text',
-                    encoding=unicode)
+                    encoding=unicode))
        return (i, feedtitle, url, title, description, author, published)
    def eco_article_found(self, req, result):
@ -81,3 +105,22 @@ class Economist(BasicNewsRecipe):
    def eco_article_failed(self, req, tb):
        '''
        Report a failed download: log the third item of the request's first
        argument at error level and the traceback ``tb`` at debug level.
        '''
        failed_item = req.args[0][2]
        self.log.error('Failed to download %s with error:'%failed_item)
        self.log.debug(tb)
    def eco_find_image_tables(self, soup):
        '''
        Yield each right-aligned <table> in ``soup`` that contains exactly one
        <img> and either one or two <font> tags.
        '''
        for candidate in soup.findAll('table', align='right'):
            font_count = len(candidate.findAll('font'))
            if font_count not in (1, 2):
                continue
            if len(candidate.findAll('img')) != 1:
                continue
            yield candidate
    def postprocess_html(self, soup, first):
        '''
        Replace every image table found by eco_find_image_tables() with a
        centred <div> holding the caption text, a <br> and the image, in that
        order. Returns the modified ``soup``.
        '''
        # Materialise the matches first: the tree is modified while we loop.
        image_tables = list(self.eco_find_image_tables(soup))
        for image_table in image_tables:
            caption_tag = image_table.find('font')
            picture = image_table.find('img')
            wrapper = Tag(soup, 'div')
            wrapper['style'] = 'text-align:center;font-size:70%'
            caption_text = NavigableString(self.tag_to_string(caption_tag))
            wrapper.insert(0, caption_text)
            wrapper.insert(1, Tag(soup, 'br'))
            # Detach the image from the table before re-parenting it
            picture.extract()
            wrapper.insert(2, picture)
            image_table.replaceWith(wrapper)
        return soup
--- a/resources/recipes/freenature.recipe
+++ b/resources/recipes/freenature.recipe
@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 import re
 class NatureNews(BasicNewsRecipe):
@ -30,15 +29,3 @@ class NatureNews(BasicNewsRecipe):
    def get_article_url(self, article):
        return article.get('id')
    #def preprocess_html(self, soup):
        #story = soup.find(name='div', attrs={'id':'contentColumn'})
        #td = heading.findParent(name='td')
        #td.extract()
        #soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        #body = soup.find(name='body')
        #body.insert(0, story)
        #for x in soup.findAll(name='p', text=lambda x:x and '--&gt;' in x):
             #p = x.findParent('p')
             #if p is not None:
                  #p.extract()
        #return soup
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@ -27,7 +27,7 @@ class NYTimes(BasicNewsRecipe):
                       'side_tool', 'side_index',
                       'relatedArticles', 'relatedTopics', 'adxSponLink']),
                   dict(name=['script', 'noscript', 'style'])]
-    encoding = 'cp1252'
+    #encoding = 'cp1252'
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
@ -118,5 +118,5 @@ class NYTimes(BasicNewsRecipe):
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
+        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
--- a/resources/recipes/nzherald.recipe
+++ b/resources/recipes/nzherald.recipe
@ -0,0 +1,73 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class NewZealandHerald(BasicNewsRecipe):
    '''
    Recipe for the New Zealand Herald. The article list is scraped from the
    headline page of each section (see parse_index) instead of RSS feeds.
    '''

    title       = 'New Zealand Herald'
    __author__  = 'Krittika Goyal'
    description = 'Daily news'
    timefmt = ' [%d %b, %Y]'

    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'})
    remove_tags_after  = dict(name='div', attrs={'class':'callToAction'})
    remove_tags = [
       dict(name='iframe'),
       dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}),
       #dict(name='div', attrs={'id':['shareContainer']}),
       #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}),
       #dict(name='table', attrs={'cellspacing':'0'}),
    ]

    def preprocess_html(self, soup):
        '''Remove the first <table> of the page, if there is one.'''
        table = soup.find('table')
        if table is not None:
            table.extract()
        return soup

    #TO GET ARTICLES IN SECTION
    def nz_parse_section(self, url):
        '''
        Collect the articles listed on the section headline page at ``url``.

        :return: list of dicts with the keys 'title', 'url', 'description'
                 and 'date' (the last two are always empty strings); an empty
                 list when the expected list markup is not found
        '''
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'class':'col-300 categoryList'})
        date = None
        if div is not None:
            date = div.find(attrs={'class':'link-list-heading'})
        if date is None:
            # Without this guard a changed page layout raised AttributeError
            # and aborted the whole download; now only this section is
            # skipped (parse_index ignores empty sections).
            self.log.warn('\t\tNo article list found in', url)
            return []

        current_articles = []
        for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}):
            # Only the lists under the first heading are taken: stop as soon
            # as the next heading is reached.
            if x.get('class') == 'link-list-heading': break
            for li in x.findAll('li'):
                a = li.find('a', href=True)
                if a is None:
                    continue
                title = self.tag_to_string(a)
                href = a.get('href', False)
                if not href or not title:
                    continue
                if href.startswith('/'):
                    # Site-relative link: make it absolute
                    href = 'http://www.nzherald.co.nz'+href
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', href)
                current_articles.append({'title': title, 'url':href,
                    'description':'', 'date':''})

        return current_articles

    # To GET SECTIONS
    def parse_index(self):
        '''
        Return a list of (section title, articles) tuples, one for every
        section below for which nz_parse_section() found articles.
        '''
        feeds = []
        for title, url in [
            ('National',
             'http://www.nzherald.co.nz/nz/news/headlines.cfm?c_id=1'),
            ('World',
             'http://www.nzherald.co.nz/world/news/headlines.cfm?c_id=2'),
            ('Politics',
             'http://www.nzherald.co.nz/politics/news/headlines.cfm?c_id=280'),
            ('Crime',
             'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'),
            ('Environment',
             'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'),
         ]:
            articles = self.nz_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
--- a/resources/recipes/sg_hu.recipe
+++ b/resources/recipes/sg_hu.recipe
@ -0,0 +1,16 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class SGhu(BasicNewsRecipe):
    '''Recipe for SG.hu; each article is fetched through its printer page.'''

    title       = u'SG.hu'
    description = u'Informatika \xe9s Tudom\xe1ny'
    __author__  = 'davotibarna'
    language    = 'hu'
    encoding    = 'ISO-8859-2'

    feeds = [(u'SG.hu', u'http://www.sg.hu/plain/rss.xml')]
    max_articles_per_feed = 100
    oldest_article        = 5
    no_stylesheets        = True

    def print_version(self, url):
        '''Return ``url`` with 'cikkek/' replaced by the printer script path.'''
        article_part, printer_part = 'cikkek/', 'printer.php?cid='
        return url.replace(article_part, printer_part)
--- a/resources/recipes/the_escapist.recipe
+++ b/resources/recipes/the_escapist.recipe
@ -0,0 +1,59 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini'
 __copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
 description   = 'the Escapist Magazine - v1.02 (09, January 2010)'
 '''
 http://www.escapistmagazine.com/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class al(BasicNewsRecipe):
    '''Recipe for the Escapist Magazine; articles are fetched as print pages.'''

    author        = 'Lorenzo Vigentini'
    description   = 'the Escapist Magazine'

    cover_url      = 'http://cdn.themis-media.com/themes/escapistmagazine/default/images/logo.png'
    title          = u'the Escapist Magazine'
    publisher      = 'Themis media'
    category       = 'Video games news, lifestyle, gaming culture'

    language       = 'en'
    timefmt        = '[%a, %d %b, %Y]'

    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    recursion             = 10

    remove_javascript     = True
    no_stylesheets        = True

    feeds          = [
                       (u'Daily News', u'http://www.escapistmagazine.com/rss/news/0.xml'),
                       (u'Articles', u'http://www.escapistmagazine.com/rss/articles/0.xml')
                     ]

    def print_version(self,url):
        '''
        Build the print-page URL for the article at ``url``.

        The result is ``<site>/<4th '/'-separated piece of url>/print/<id>``,
        where <id> is the part of the last URL segment before its first '-'.
        '''
        baseURL='http://www.escapistmagazine.com'
        segments = url.split('/')
        subPath= '/'+ segments[3] + '/'
        # The id used to be cut out as the first 4 or 5 characters of the last
        # segment, which raised IndexError on shorter segments and mangled ids
        # of any other length. Splitting on the first '-' gives the same
        # result for 4 and 5 character ids and also handles the other cases.
        # NOTE(review): assumes '-' always separates the id from the title
        # slug, as the old '-' check implied -- confirm against live URLs.
        articleURL = segments[-1].partition('-')[0]
        return baseURL + subPath + 'print/' + articleURL

    keep_only_tags     = [
                            dict(name='div', attrs={'id':'article'})
                        ]

    remove_tags        = [
                            dict(name='div',attrs={'id':['ad_leaderboard','print_notice','bottom_panel_container']})
                         ]
--- a/setup/installer/init.py
+++ b/setup/installer/init.py
@ -111,6 +111,7 @@ class VMInstaller(Command):
            self.vm = self.VM
        if not self.vmware_started():
            self.start_vmware()
        subprocess.call(['chmod', '-R', '+r', 'resources/recipes'])
        self.start_vm()
        self.download_installer()
        if not self.dont_shutdown:
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -416,6 +416,7 @@ class NookOutput(OutputProfile):
    # Screen size is a best guess
    screen_size               = (600, 730)
    comic_screen_size         = (584, 730)
    dpi                       = 167
    fbase                     = 16
    fsizes                    = [12, 12, 14, 16, 18, 20, 22, 24]
--- a/src/calibre/devices/prs505/books.py
+++ b/src/calibre/devices/prs505/books.py
@ -187,7 +187,7 @@ class BookList(_BookList):
            self.remove_book(name)
        node = self.document.createElement(self.prefix + "text")
-        mime = MIME_MAP[name.rpartition('.')[-1].lower()]
+        mime = MIME_MAP.get(name.rpartition('.')[-1].lower(), MIME_MAP['epub'])
        cid = self.max_id()+1
        try:
            sourceid = str(self[0].sourceid) if len(self) else '1'
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@ -56,7 +56,7 @@ class PRS505(CLI, Device):
    EBOOK_DIR_MAIN = 'database/media/books'
    EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of metadata fields '
-            'to turn into collections on the device. Posiibilities include: ')+\
+            'to turn into collections on the device. Possibilities include: ')+\
                    'series, tags, authors'
    EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['series', 'tags'])
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@ -117,9 +117,10 @@ class FB2MLizer(object):
        '<book-title>%s</book-title> ' \
        '</title-info><document-info> ' \
        '<program-used>%s - %s</program-used></document-info>\n' \
-        '</description>\n<body>\n<section>' % (author_first, author_middle,
+        '</description>\n<body>\n<section>' % tuple(map(prepare_string_for_xml,
            (author_first, author_middle,
            author_last, self.oeb_book.metadata.title[0].value,
-            __appname__, __version__)
+            __appname__, __version__)))
    def get_cover_page(self):
        output = u''
--- a/src/calibre/ebooks/markdown/mdx_toc.py
+++ b/src/calibre/ebooks/markdown/mdx_toc.py
@ -44,13 +44,13 @@ class TocExtension (markdown.Extension):
        replaces first string occurence of "///Table of Contents Goes Here///"
    """
-    def __init__ (self) :
+    def __init__ (self, configs={}) :
        #maybe add these as parameters to the class init?
        self.TOC_INCLUDE_MARKER = "///Table of Contents///"
        self.TOC_TITLE = "Table Of Contents"
        self.auto_toc_heading_type=2
        self.toc_heading_type=3
-
+        self.configs = configs
    def extendMarkdown(self, md, md_globals) :
        # Just insert in the end
@ -148,16 +148,22 @@ class TocPostprocessor (markdown.Postprocessor):
    def run(self, doc):
        tocPlaceholder = self.toc.findTocPlaceholder(doc)
-        tocDiv = self.toc.createTocDiv(doc)
+        if self.toc.configs.get("disable_toc", False):
-        if tocDiv:
+            if tocPlaceholder:
-            if tocPlaceholder :
+                tocPlaceholder.parent.replaceChild(tocPlaceholder, "")
-                # Replace "magic" pattern with toc
+        else:
-                tocPlaceholder.parent.replaceChild(tocPlaceholder, tocDiv)
+
-            else :
+            tocDiv = self.toc.createTocDiv(doc)
-                # Dump at the end of the DOM
+
-                # Probably want to use CSS to position div
+            if tocDiv:
-                doc.documentElement.appendChild(tocDiv)
+                if tocPlaceholder :
                    # Replace "magic" pattern with toc
                    tocPlaceholder.parent.replaceChild(tocPlaceholder, tocDiv)
                else :
                    # Dump at the end of the DOM
                    # Probably want to use CSS to position div
                    doc.documentElement.appendChild(tocDiv)
-def makeExtension(configs=None) :
+def makeExtension(configs={}):
-    return TocExtension()
+    return TocExtension(configs=configs)
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -468,6 +468,7 @@ class MobiReader(object):
        self.processed_html = self.processed_html.replace('\r\n', '\n')
        self.processed_html = self.processed_html.replace('> <', '>\n<')
        self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
        self.processed_html = re.sub(r'<?xml[^>]*>', '', self.processed_html)
    def remove_random_bytes(self, html):
        return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
@ -490,6 +491,8 @@ class MobiReader(object):
            'xx-large': '6',
            }
        mobi_version = self.book_header.mobi_version
        for x in root.xpath('//ncx'):
            x.getparent().remove(x)
        for i, tag in enumerate(root.iter(etree.Element)):
            tag.attrib.pop('xmlns', '')
            for x in tag.attrib:
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -960,7 +960,7 @@ class Manifest(object):
            else:
                title = _('Unknown')
-            return self._parse_xhtml(convert_markdown(data, title))
+            return self._parse_xhtml(convert_markdown(data, title=title))
        def _parse_css(self, data):
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@ -174,7 +174,8 @@ class EbookIterator(object):
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)
-        if processed or plumber.input_fmt.lower() in ('pdf', 'rb'):
+        if processed or plumber.input_fmt.lower() in ('pdf', 'rb') and \
                not hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
                    plumber.input_plugin)
        if hasattr(self.pathtoopf, 'manifest'):
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@ -15,7 +15,7 @@ pdfreflow, pdfreflow_err = plugins['pdfreflow']
 class PDFInput(InputFormatPlugin):
    name        = 'PDF Input'
-    author      = 'John Schember'
+    author      = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types  = set(['pdf'])
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@ -18,9 +18,52 @@ class Font(object):
        self.color = spec.get('color')
        self.family = spec.get('family')
-class Text(object):
+class Column(object):
-    def __init__(self, text, font_map, opts, log):
+    def __init__(self):
        self.left = self.right = self.top = self.bottom = 0
        self.width = self.height = 0
        self.elements = []
    def add(self, elem):
        '''
        Add ``elem`` to the column and refresh the column's bounding box.

        Does nothing when an equal element is already present. Elements are
        kept sorted by their ``bottom`` edge; ``top`` is taken from the first
        element and ``bottom`` from the last one after sorting.

        :param elem: object with ``top``, ``bottom``, ``left`` and ``right``
        '''
        if elem in self.elements: return
        self.elements.append(elem)
        # key-based sort: same stable ordering as the former cmp-based sort,
        # without relying on the cmp builtin
        self.elements.sort(key=lambda e: e.bottom)
        self.top = self.elements[0].top
        self.bottom = self.elements[-1].bottom
        # min()/max() over the elements replace the manual loop that was
        # seeded with sys.maxint; as before, right is never reported below 0
        self.left = min(e.left for e in self.elements)
        self.right = max([0] + [e.right for e in self.elements])
        self.width, self.height = self.right-self.left, self.bottom-self.top
    def __iter__(self):
        for x in self.elements:
            yield x
 class Element(object):
    '''
    Base class for page elements that are identified by their ``id``
    attribute (set by subclasses): equality and hashing both use ``id``.
    '''

    def __eq__(self, other):
        # Objects without an id (None, strings, ...) are simply "not equal"
        # instead of raising AttributeError
        if not hasattr(other, 'id'):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so keep the two consistent
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        return hash(self.id)
 class Image(Element):
    '''Image element built from an ``img`` node of the input document.'''

    def __init__(self, img, opts, log, idc):
        '''
        :param img: node whose ``get(name)`` returns the attribute values
        :param idc: iterator that supplies the next unique element id
        '''
        self.opts = opts
        self.log = log
        self.id = idc.next()
        # rwidth/rheight become width/height; iwidth/iheight keep their names
        attribute_names = ('top', 'left', 'rwidth', 'rheight', 'iwidth',
                'iheight')
        geometry = [float(value) for value in
                [img.get(name) for name in attribute_names]]
        (self.top, self.left, self.width, self.height,
                self.iwidth, self.iheight) = geometry
        self.src = img.get('src')
 class Text(Element):
    def __init__(self, text, font_map, opts, log, idc):
        self.id = idc.next()
        self.opts, self.log = opts, log
        self.font_map = font_map
        self.top, self.left, self.width, self.height = map(float, map(text.get,
@ -90,47 +133,6 @@ class Interval(object):
        return hash('(%f,%f)'%self.left, self.right)
 class HorizontalBox(object):
    '''
    A group of text fragments treated as one horizontal row, together with
    the gaps (Interval objects) found between and around them by sort().
    '''

    def __init__(self, base_text):
        '''Start a box from ``base_text``; its bottom becomes the box bottom.'''
        self.number_of_columns = None
        self.column_map = {}
        self.bottom = base_text.bottom
        self.texts = [base_text]

    def append(self, t):
        '''Add another text fragment to the box.'''
        self.texts.append(t)

    def sort(self, left_margin, right_margin):
        '''
        Order the texts from left to right, compute the box extents and
        collect every gap wider than 3 units: between neighbouring texts,
        between ``left_margin`` and the first text, and between the last
        text and ``right_margin``.
        '''
        ordered = self.texts
        ordered.sort(cmp=lambda a, b: cmp(a.left, b.left))

        self.top = min([sys.maxint] + [t.top for t in ordered])
        self.bottom = max([0] + [t.bottom for t in ordered])
        first, last = ordered[0], ordered[-1]
        self.left = first.left
        self.right = last.right

        self.gaps = gaps = []
        for previous, current in zip(ordered, ordered[1:]):
            between = Interval(previous.right, current.left)
            if between.width > 3:
                gaps.append(between)
        leading = Interval(left_margin, first.left)
        if leading.width > 3:
            gaps.insert(0, leading)
        trailing = Interval(last.right, right_margin)
        if trailing.width > 3:
            gaps.append(trailing)

    def has_intersection_with(self, gap):
        '''Return True if any gap of this box has a truthy intersection with ``gap``.'''
        for own_gap in self.gaps:
            if own_gap.intersection(gap):
                return True
        return False

    def identify_columns(self, column_gaps):
        '''Record the column count: one more than the number of column gaps.'''
        self.number_of_columns = 1 + len(column_gaps)
 class Page(object):
    # Fraction of a character width that two strings have to be apart,
@ -141,8 +143,10 @@ class Page(object):
    # for them to be considered to be part of the same text fragment
    LINE_FACTOR = 0.4
    YFUZZ = 1.5
-    def __init__(self, page, font_map, opts, log):
+
    def __init__(self, page, font_map, opts, log, idc):
        self.opts, self.log = opts, log
        self.font_map = font_map
        self.number = int(page.get('number'))
@ -154,7 +158,7 @@ class Page(object):
        self.left_margin, self.right_margin = self.width, 0
        for text in page.xpath('descendant::text'):
-            self.texts.append(Text(text, self.font_map, self.opts, self.log))
+            self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
            text = self.texts[-1]
            self.left_margin = min(text.left, self.left_margin)
            self.right_margin = max(text.right, self.right_margin)
@ -162,16 +166,22 @@ class Page(object):
        self.textwidth = self.right_margin - self.left_margin
        self.font_size_stats = {}
        self.average_text_height = 0
        for t in self.texts:
            if t.font_size not in self.font_size_stats:
                self.font_size_stats[t.font_size] = 0
            self.font_size_stats[t.font_size] += len(t.text_as_string)
            self.average_text_height += t.height
        self.average_text_height /= len(self.texts)
        self.font_size_stats = FontSizeStats(self.font_size_stats)
        self.coalesce_fragments()
-        #self.identify_columns()
+        self.elements = list(self.texts)
        for img in page.xpath('descendant::img'):
            self.elements.append(Image(img, self.opts, self.log, idc))
        self.elements.sort(cmp=lambda x,y:cmp(x.top, y.top))
    def coalesce_fragments(self):
@ -196,46 +206,50 @@ class Page(object):
            if match is not None:
                self.texts.remove(match)
-    def sort_into_horizontal_boxes(self, document_font_size_stats):
+    def first_pass(self):
-        self.horizontal_boxes = []
+        self.regions = []
        if not self.elements:
            return
        for i, x in enumerate(self.elements):
            x.idx = i
        self.current_region = None
        processed = set([])
        for x in self.elements:
            if x in processed: continue
            elems = set(self.find_elements_in_row_of(x))
            columns = self.sort_into_columns(x, elems)
            processed.update(elems)
            columns
-        def find_closest_match(text):
+    def sort_into_columns(self, elem, neighbors):
-            'Return horizontal box whose bottom is closest to text or None'
+        columns = [Column()]
-            min, ans = 3.1, None
+        columns[0].add(elem)
-            for hb in self.horizontal_boxes:
+        for x in neighbors:
-                diff = abs(text.bottom - hb.bottom)
+            added = False
-                if diff < min:
+            for c in columns:
-                    diff, ans = min, hb
+                if c.contains(x):
-            return ans
+                    c.add(x)
-
+                    added = True
-        for t in self.texts:
+                    break
-            hb = find_closest_match(t)
+            if not added:
-            if hb is None:
+                columns.append(Column())
-                self.horizontal_boxes.append(HorizontalBox(t))
+                columns[-1].add(x)
-            else:
+                columns.sort(cmp=lambda x,y:cmp(x.left, y.left))
-                hb.append(t)
+        return columns
        for hb in self.horizontal_boxes:
            hb.sort(self.left_margin, self.right_margin)
        self.horizontal_boxes.sort(cmp=lambda x,y: cmp(x.bottom, y.bottom))
    def identify_columns(self):
        '''
        Set ``is_column_gap`` on every gap of every horizontal box: the value
        is truthy when both neighbouring boxes intersect that gap.

        The neighbours are the next two boxes; the second-to-last box uses
        the boxes on either side of it and the last box uses the two before
        it.
        NOTE(review): for the last box the slice holds fewer than two items
        when there are fewer than three boxes, so the unpacking below fails
        in that case.
        '''
        def neighborhood(i):
            boxes = self.horizontal_boxes
            last = len(boxes) - 1
            if i == last:
                return boxes[i-2:i]
            if i == last - 1:
                return (boxes[i-1], boxes[i+1])
            return boxes[i+1], boxes[i+2]

        for index, hbox in enumerate(self.horizontal_boxes):
            first, second = neighborhood(index)
            for gap in hbox.gaps:
                gap.is_column_gap = (first.has_intersection_with(gap) and
                        second.has_intersection_with(gap))
    def find_elements_in_row_of(self, x):
        '''
        Yield the elements that share a row with ``x``.

        Only elements within about 15 positions of ``x.idx`` in
        ``self.elements`` are examined. An element qualifies when the
        intersection of its vertical extent with the band around ``x.top``
        is wider than half the average text height and the intersection of
        its horizontal extent with that of ``x`` has a width of 0 or less
        (widths as reported by Interval.intersection()).
        '''
        row_band = Interval(x.top - self.YFUZZ * self.average_text_height,
                x.top + self.YFUZZ*(1+self.average_text_height))
        x_extent = Interval(x.left, x.right)
        start = max(0, x.idx-15)
        for candidate in self.elements[start:x.idx+15]:
            if candidate is x:
                continue
            vertical = Interval(candidate.top, candidate.bottom)
            horizontal = Interval(candidate.left, candidate.right)
            if not (row_band.intersection(vertical).width >
                    0.5*self.average_text_height):
                continue
            if horizontal.intersection(x_extent).width <= 0:
                yield candidate
 class PDFDocument(object):
@ -244,6 +258,7 @@ class PDFDocument(object):
        self.opts, self.log = opts, log
        parser = etree.XMLParser(recover=True)
        self.root = etree.fromstring(xml, parser=parser)
        idc = iter(xrange(sys.maxint))
        self.fonts = []
        self.font_map = {}
@ -256,14 +271,15 @@ class PDFDocument(object):
        self.page_map = {}
        for page in self.root.xpath('//page'):
-            page = Page(page, self.font_map, opts, log)
+            page = Page(page, self.font_map, opts, log, idc)
            self.page_map[page.id] = page
            self.pages.append(page)
        self.collect_font_statistics()
        for page in self.pages:
-            page.sort_into_horizontal_boxes(self.font_size_stats)
+            page.document_font_stats = self.font_size_stats
            page.first_pass()
    def collect_font_statistics(self):
        self.font_size_stats = {}
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -31,6 +31,8 @@ class TXTInput(InputFormatPlugin):
        OptionRecommendation(name='markdown', recommended_value=False,
            help=_('Run the text input through the markdown pre-processor. To '
                'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
        OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
            help=_('Do not insert a Table of Contents into the output text.')),
    ])
    def convert(self, stream, options, file_ext, log,
@ -50,10 +52,10 @@ class TXTInput(InputFormatPlugin):
        if options.markdown:
            log.debug('Running text though markdown conversion...')
            try:
-                html = convert_markdown(txt)
+                html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
-                    'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
+                    ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
        else:
            html = convert_basic(txt)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -39,10 +39,11 @@ def convert_basic(txt, title=''):
    return HTML_TEMPLATE % (title, '\n'.join(lines))
-def convert_markdown(txt, title=''):
+def convert_markdown(txt, title='', disable_toc=False):
    md = markdown.Markdown(
-        extensions=['footnotes', 'tables', 'toc'],
+          extensions=['footnotes', 'tables', 'toc'],
-        safe_mode=False,)
+          extension_configs={"toc": {"disable_toc": disable_toc}},
          safe_mode=False)
    return HTML_TEMPLATE % (title, md.convert(txt))
 def separate_paragraphs_single_line(txt):
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@ -605,9 +605,9 @@ def build_forms(srcdir, info=None):
            if form.endswith('viewer%smain.ui'%os.sep):
                info('\t\tPromoting WebView')
                dat = dat.replace('self.view = QtWebKit.QWebView(', 'self.view = DocumentView(')
-                if iswindows:
+                dat = dat.replace('self.view = QWebView(', 'self.view = DocumentView(')
-                    dat = dat.replace('self.view = QWebView(', 'self.view = DocumentView(')
+                dat = dat.replace('from QtWebKit.QWebView import QWebView',
-                    dat = dat.replace('from QtWebKit.QWebView import QWebView', '')
+                        'from PyQt4 import QtWebKit\nfrom PyQt4.QtWebKit import QWebView')
                dat += '\n\nfrom calibre.gui2.viewer.documentview import DocumentView'
            open(compiled_form, 'wb').write(dat)
--- a/src/calibre/gui2/convert/txt_input.py
+++ b/src/calibre/gui2/convert/txt_input.py
@ -14,6 +14,6 @@ class PluginWidget(Widget, Ui_Form):
    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent, 'txt_input',
-            ['single_line_paras', 'print_formatted_paras', 'markdown'])
+            ['single_line_paras', 'print_formatted_paras', 'markdown', 'markdown_disable_toc'])
        self.db, self.book_id = db, book_id
        self.initialize_options(get_option, get_help, db, book_id)
--- a/src/calibre/gui2/convert/txt_input.ui
+++ b/src/calibre/gui2/convert/txt_input.ui
@ -14,19 +14,6 @@
   <string>Form</string>
  </property>
  <layout class="QGridLayout" name="gridLayout">
   <item row="4" column="0">
    <spacer name="verticalSpacer">
     <property name="orientation">
      <enum>Qt::Vertical</enum>
     </property>
     <property name="sizeHint" stdset="0">
      <size>
       <width>20</width>
       <height>213</height>
      </size>
     </property>
    </spacer>
   </item>
   <item row="0" column="0">
    <widget class="QCheckBox" name="opt_single_line_paras">
     <property name="text">
@ -34,6 +21,13 @@
     </property>
    </widget>
   </item>
   <item row="1" column="0">
    <widget class="QCheckBox" name="opt_print_formatted_paras">
     <property name="text">
      <string>Assume print formatting</string>
     </property>
    </widget>
   </item>
   <item row="2" column="0">
    <widget class="QCheckBox" name="opt_markdown">
     <property name="text">
@ -51,15 +45,45 @@
     </property>
    </widget>
   </item>
-   <item row="1" column="0">
+   <item row="4" column="0">
-    <widget class="QCheckBox" name="opt_print_formatted_paras">
+    <widget class="QCheckBox" name="opt_markdown_disable_toc">
     <property name="text">
-      <string>Assume print formatting</string>
+      <string>Do not insert Table of Contents into output text when using markdown</string>
     </property>
    </widget>
   </item>
   <item row="5" column="0">
    <spacer name="verticalSpacer">
     <property name="orientation">
      <enum>Qt::Vertical</enum>
     </property>
     <property name="sizeHint" stdset="0">
      <size>
       <width>20</width>
       <height>213</height>
      </size>
     </property>
    </spacer>
   </item>
  </layout>
 </widget>
 <resources/>
- <connections/>
+<connections>
  <connection>
   <sender>opt_markdown</sender>
   <signal>toggled(bool)</signal>
   <receiver>opt_markdown_disable_toc</receiver>
   <slot>setEnabled(bool)</slot>
   <hints>
    <hint type="sourcelabel">
     <x>76</x>
     <y>80</y>
    </hint>
    <hint type="destinationlabel">
     <x>418</x>
     <y>105</y>
    </hint>
   </hints>
  </connection>
 </connections>
 </ui>
--- a/src/calibre/gui2/dialogs/tag_editor.py
+++ b/src/calibre/gui2/dialogs/tag_editor.py
@ -6,6 +6,7 @@ from PyQt4.QtGui import QDialog
 from calibre.gui2.dialogs.tag_editor_ui import Ui_TagEditor
 from calibre.gui2 import qstring_to_unicode
 from calibre.gui2 import question_dialog, error_dialog
 from calibre.constants import islinux
 class TagEditor(QDialog, Ui_TagEditor):
@ -42,7 +43,8 @@ class TagEditor(QDialog, Ui_TagEditor):
        self.connect(self.add_tag_button, SIGNAL('clicked()'), self.add_tag)
        self.connect(self.delete_button,  SIGNAL('clicked()'), self.delete_tags)
        self.connect(self.add_tag_input,  SIGNAL('returnPressed()'), self.add_tag)
-        self.connect(self.available_tags, SIGNAL('itemActivated(QListWidgetItem*)'), self.apply_tags)
+        if not islinux:
            self.connect(self.available_tags, SIGNAL('itemActivated(QListWidgetItem*)'), self.apply_tags)
        self.connect(self.applied_tags,   SIGNAL('itemActivated(QListWidgetItem*)'), self.unapply_tags)
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@ -576,6 +576,13 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
        self.location_view.setCurrentIndex(self.location_view.model().index(0))
        if self.cover_flow is not None and dynamic.get('cover_flow_visible', False):
            self.status_bar.cover_flow_button.toggle()
        if dynamic.get('tag_view_visible', False):
            self.status_bar.tag_view_button.toggle()
    def resizeEvent(self, ev):
        MainWindow.resizeEvent(self, ev)
        self.search.setMaximumWidth(self.width()-150)
@ -1837,6 +1844,8 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
    def write_settings(self):
        config.set('main_window_geometry', self.saveGeometry())
        dynamic.set('sort_column', self.library_view.model().sorted_on)
        dynamic.set('tag_view_visible', self.tags_view.isVisible())
        dynamic.set('cover_flow_visible', self.cover_flow.isVisible())
        self.library_view.write_settings()
        if self.device_connected:
            self.save_device_view_settings()
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -924,7 +924,10 @@ class LibraryDatabase2(LibraryDatabase):
            fmt_path = os.path.join(path, name+format)
            if os.path.exists(fmt_path):
                return fmt_path
-            candidates = glob.glob(os.path.join(path, '*'+format))
+            try:
                candidates = glob.glob(os.path.join(path, '*'+format))
            except: # If path contains strange characters this throws an exc
                candidates = []
            if format and candidates and os.path.exists(candidates[0]):
                shutil.copyfile(candidates[0], fmt_path)
                return fmt_path
@ -1122,10 +1125,18 @@ class LibraryDatabase2(LibraryDatabase):
        self.set_path(id, True)
        self.notify('metadata', [id])
-    def set_metadata(self, id, mi):
+    def set_metadata(self, id, mi, ignore_errors=False):
        '''
        Set metadata for the book `id` from the `MetaInformation` object `mi`
        '''
        def doit(func, *args, **kwargs):
            try:
                func(*args, **kwargs)
            except:
                if ignore_errors:
                    traceback.print_exc()
                else:
                    raise
        if mi.title:
            self.set_title(id, mi.title)
        if not mi.authors:
@ -1135,29 +1146,29 @@ class LibraryDatabase2(LibraryDatabase):
            authors += string_to_authors(a)
        self.set_authors(id, authors, notify=False)
        if mi.author_sort:
-            self.set_author_sort(id, mi.author_sort, notify=False)
+            doit(self.set_author_sort, id, mi.author_sort, notify=False)
        if mi.publisher:
-            self.set_publisher(id, mi.publisher, notify=False)
+            doit(self.set_publisher, id, mi.publisher, notify=False)
        if mi.rating:
-            self.set_rating(id, mi.rating, notify=False)
+            doit(self.set_rating, id, mi.rating, notify=False)
        if mi.series:
-            self.set_series(id, mi.series, notify=False)
+            doit(self.set_series, id, mi.series, notify=False)
        if mi.cover_data[1] is not None:
-            self.set_cover(id, mi.cover_data[1])
+            doit(self.set_cover, id, mi.cover_data[1])
        elif mi.cover is not None and os.access(mi.cover, os.R_OK):
-            self.set_cover(id, open(mi.cover, 'rb').read())
+            doit(self.set_cover, id, open(mi.cover, 'rb'))
        if mi.tags:
-            self.set_tags(id, mi.tags, notify=False)
+            doit(self.set_tags, id, mi.tags, notify=False)
        if mi.comments:
-            self.set_comment(id, mi.comments, notify=False)
+            doit(self.set_comment, id, mi.comments, notify=False)
        if mi.isbn and mi.isbn.strip():
-            self.set_isbn(id, mi.isbn, notify=False)
+            doit(self.set_isbn, id, mi.isbn, notify=False)
        if mi.series_index:
-            self.set_series_index(id, mi.series_index, notify=False)
+            doit(self.set_series_index, id, mi.series_index, notify=False)
        if mi.pubdate:
-            self.set_pubdate(id, mi.pubdate, notify=False)
+            doit(self.set_pubdate, id, mi.pubdate, notify=False)
        if getattr(mi, 'timestamp', None) is not None:
-            self.set_timestamp(id, mi.timestamp, notify=False)
+            doit(self.set_timestamp, id, mi.timestamp, notify=False)
        self.set_path(id, True)
        self.notify('metadata', [id])
@ -1353,7 +1364,10 @@ class LibraryDatabase2(LibraryDatabase):
    def set_series_index(self, id, idx, notify=True):
        if idx is None:
            idx = 1.0
-        idx = float(idx)
+        try:
            idx = float(idx)
        except:
            idx = 1.0
        self.conn.execute('UPDATE books SET series_index=? WHERE id=?', (idx, id))
        self.conn.commit()
        self.data.set(id, FIELD_MAP['series_index'], idx, row_is_id=True)
@ -1513,7 +1527,7 @@ class LibraryDatabase2(LibraryDatabase):
        id = obj.lastrowid
        self.data.books_added([id], self)
        self.set_path(id, True)
-        self.set_metadata(id, mi)
+        self.set_metadata(id, mi, ignore_errors=True)
        for path in formats:
            ext = os.path.splitext(path)[1][1:].lower()
            if ext == 'opf':
--- a/src/calibre/library/server.py
+++ b/src/calibre/library/server.py
@ -79,7 +79,7 @@ class LibraryServer(object):
            </book>
        ''')
-    MOBILE_UA = re.compile('(?i)(?:iPhone|Opera Mini|NetFront|webOS|Mobile|Android|imode|DoCoMo|Minimo|Blackberry|MIDP|Symbian)')
+    MOBILE_UA = re.compile('(?i)(?:iPhone|Opera Mini|NetFront|webOS|Mobile|Android|imode|DoCoMo|Minimo|Blackberry|MIDP|Symbian|HD2)')
    MOBILE_BOOK = textwrap.dedent('''\
    <tr xmlns:py="http://genshi.edgewall.org/">
@ -90,7 +90,7 @@ class LibraryServer(object):
        <py:for each="format in r[13].split(',')">
            <span class="button"><a href="/get/${format}/${authors}-${r[1]}_${r[0]}.${format}">${format.lower()}</a></span>&nbsp;
        </py:for>
-       ${r[1]} by ${authors} - ${r[6]/1024}k - ${r[3] if r[3] else ''} ${pubdate} ${'['+r[7]+']' if r[7] else ''}
+       ${r[1]}${(' ['+r[9]+'-'+r[10]+']') if r[9] else ''} by ${authors} - ${r[6]/1024}k - ${r[3] if r[3] else ''} ${pubdate} ${'['+r[7]+']' if r[7] else ''}
    </td>
    </tr>
    ''')
@ -802,7 +802,7 @@ class LibraryServer(object):
    @expose
-    def get(self, what, id):
+    def get(self, what, id, *args, **kwargs):
        'Serves files, covers, thumbnails from the calibre database'
        try:
            id = int(id)
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -140,11 +140,11 @@ First install the Stanza reader on your iPhone using iTunes.
  * Convert the books you want to read on your iPhone to EPUB format by selecting them and clicking the Convert button.
  * Turn on the Content Server in |app|'s preferences and leave |app| running.
-Now you should be able to access your books on your iPhone by opening Stanza and going to "Shared Books". Under Shared Books you will see an entry "Book in calibre". If you don't, make sure your iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza, by clicking "Online Catalog" and the clicking the plus icon in the lower right corner to add a new catalog. In the Add Catalog screen enter whatever name you like and in the URL field, enter the following::
+Now you should be able to access your books on your iPhone by opening Stanza. Go to "Get Books" and then click the "Shared" tab. Under Shared you will see an entry "Books in calibre". If you don't, make sure your iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza. To do this, click the "Shared" tab, then click the "Edit" button and then click "Add book source" to add a new book source. In the Add Book Source screen enter whatever name you like and in the URL field, enter the following::
    http://192.168.1.2:8080/
-Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address.  
+Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address your computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. Now click "Save" and you are done.
 How do I use |app| with my Android phone?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/src/calibre/translations/ar.po
+++ b/src/calibre/translations/ar.po
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
--- a/src/calibre/translations/cs.po
+++ b/src/calibre/translations/cs.po
--- a/src/calibre/translations/de.po
+++ b/src/calibre/translations/de.po
--- a/src/calibre/translations/eo.po
+++ b/src/calibre/translations/eo.po
--- a/src/calibre/translations/es.po
+++ b/src/calibre/translations/es.po
--- a/src/calibre/translations/fr.po
+++ b/src/calibre/translations/fr.po
--- a/src/calibre/translations/it.po
+++ b/src/calibre/translations/it.po
--- a/src/calibre/translations/ja.po
+++ b/src/calibre/translations/ja.po
--- a/src/calibre/translations/lv.po
+++ b/src/calibre/translations/lv.po
--- a/src/calibre/translations/nb.po
+++ b/src/calibre/translations/nb.po
--- a/src/calibre/translations/nl.po
+++ b/src/calibre/translations/nl.po
--- a/src/calibre/translations/pl.po
+++ b/src/calibre/translations/pl.po
--- a/src/calibre/translations/ru.po
+++ b/src/calibre/translations/ru.po
--- a/src/calibre/translations/sq.po
+++ b/src/calibre/translations/sq.po
--- a/src/calibre/translations/sv.po
+++ b/src/calibre/translations/sv.po
--- a/src/calibre/translations/te.po
+++ b/src/calibre/translations/te.po
--- a/src/calibre/translations/tr.po
+++ b/src/calibre/translations/tr.po
--- a/src/calibre/translations/zh_TW.po
+++ b/src/calibre/translations/zh_TW.po
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@ -191,9 +191,9 @@ class RecursiveFetcher(object):
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        # Not sure is this is really needed as I think mechanize
-        # handles quoting automatically, but leaving it in
+        # handles quoting automatically, but leaving it
        # in case it breaks something
-        if re.search(r'\s+|,', url) is not None:
+        if re.search(r'\s+', url) is not None:
            purl = list(urlparse.urlparse(url))
            for i in range(2, 6):
                purl[i] = quote(purl[i])