Merge from trunk
This commit is contained in:
commit c68075bc08
@@ -1,6 +1,6 @@

__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
blic.rs
'''
@@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
    def print_version(self, url):
        return url + '/print'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.blic.rs/')
        alink = soup.find('a', attrs={'id':'blic_naslovna_print'})
        if alink:
            return 'http://www.blic.rs' + alink['href']
        return None
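
For context, print_version above simply appends '/print' to the article URL, and the cover URL comes from the blic_naslovna_print anchor on the home page. A small illustration (the article URL below is made up):

url = 'http://www.blic.rs/Vesti/Drustvo/12345/neki-clanak'
print(url + '/print')   # -> 'http://www.blic.rs/Vesti/Drustvo/12345/neki-clanak/print'
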
@@ -1,8 +1,15 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12

class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
@@ -19,7 +26,17 @@ class OReillyPremium(BasicNewsRecipe):
    # Don't go down
    recursions = 0
    max_articles_per_feed = 2000
    language = 'en'

    debugMessages = True

    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
    catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
        ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
        ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
        ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
        ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
    ]
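
Each catList entry is positional; parseGeneric below indexes into it rather than using named fields. A rough sketch of how one entry is consumed, mirroring the indexing used later in the recipe (illustrative only, not part of the commit):

entry = ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive',
         'a', {'class':['showLinks','homeLinks']}, []]
name, url, tag, attrs, articles = entry
# for the first three categories parseGeneric does soup.findAll(tag, attrs)
# on the page at `url` and appends the resulting article dicts to `articles`
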

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
@@ -31,6 +48,8 @@ class OReillyPremium(BasicNewsRecipe):
            br.submit()
        return br

    # Returns the best-guess print url.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        soup = self.index_to_soup(pageURL)
@@ -38,7 +57,6 @@ class OReillyPremium(BasicNewsRecipe):
            printText = soup.find('a', text=printString)
        else :
            print("Failed to find Print string "+printString+ " in "+pageURL)

        if printText:
            tag = printText.parent
            tagURL = baseURL+tag['href']
@@ -47,177 +65,111 @@ class OReillyPremium(BasicNewsRecipe):
    def stripBadChars(self, inString) :
        return inString.replace("\'", "")


    # returns a qualifying article list
    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class':['blogBody'], 'style':['padding-top:10px;']}):
            a = div.find('a', href=True)
            if not a:
                continue
            # re == regex. [href] is the link
            url = baseURL
            url +=re.sub(r'\?.*', '', a['href'])
            # Get print version
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            if debugMessages :
                print("No Spin Archive Title:"+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        return articleList


    def parseTVArchives(self, baseURL, soupURL, debugMessages):
        # TV Archives page has some Ajax, so look for the static only.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages :
            print("In parseTVArchives")
        for div in soup.findAll('a', {'class':['showLinks','homeLinks']}):
            a = div
            url = baseURL
            url +=a['href']
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            title = self.stripBadChars(title)
            if debugMessages :
                print("TV Archive "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages :
            print("Leaving TV Parse ")
        return articleList

    # Get Daily Briefing Archives
    def parseDailyBriefs(self, baseURL, soupURL, debugMessages) :
        print("Starting daily briefs")
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class':['defaultHeaderSmallLinks']}):
            # re == regex. [href] is the link
            url = baseURL
            url +=re.sub(r'\?.*', '', div['href'])
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = div.contents[0]
            if debugMessages :
                print("Daily Brief - title:"+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        print("Leaving daily briefs")
        return articleList

    # Get the weekly Stratfor intelligence report
    def parseStratfor(self, baseURL, soupURL, debugMessages):
        # http://www.billoreilly.com/blog?categoryID=5
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages :
            print("In parseStratfor")
        a = soup.find('a', {'class':['blogLinks']})
        url = baseURL
        url +=a['href']
        title = self.tag_to_string(a, use_alt=True).strip()
        if debugMessages :
            print("url: "+url)
            print("title:"+title)
        # Get Stratfor contents so we can get the real title.
        stratSoup = self.index_to_soup(url)
        title = stratSoup.html.head.title.string
        stratIndex = title.find('Stratfor.com:', 0)
        if (stratIndex > -1) :
            title = title[stratIndex+14:-1]
        # Look for first blogBody <td class="blogBody"
        stratBody = stratSoup.find('td', {'class':['blogBody']})
        if debugMessages :
            print("Strat content title:"+title)
            print("Strat body: "+ stratBody.contents[0])
        description = 'None'
        pubdate = time.strftime('%a, %d %b')
        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages :
            print("Leaving Stratfor Parse ")
        return articleList

    def parseTalkingPoints(self, baseURL, soupURL, debugMessages) :
        # Look for blogDate. That's got the date... Then the next blogBody has the title. and then an anchor with class "homeBlogReadMore bold" has the URL.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages :
            print("Starting Talking Points")
        topDate = soup.find("td", "blogBody")
        if not topDate :
            print("Failed to find date in Talking Points")
        # This page has the contents in double-wrapped tables!
        # tableParent = topDate.parent.parent
        myTable = topDate.findParents('table')[0]
        upOneTable = myTable.findParents('table')[0]
        upTwo = upOneTable.findParents('table')[0]
        # Now navigate rows of upTwo
        if debugMessages :
            print("Entering rows")
        for rows in upTwo.findChildren("tr", recursive=False):
            # Inside top level table, each row is an article
            rowTable = rows.find("table")
            articleTable = rowTable.find("table")
            articleTable = rows.find("tr")
            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
            blogDate = articleTable.find("a","blogDate").contents[0]
            # Skip to second blogBody for this.
            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
            # re == regex. [href] is the link
            url = baseURL
            url +=re.sub(r'\?.*', '', blogURL)
            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
            if debugMessages :
                print("Talking Points Memo title "+title+" at url: "+url)
    def parseGeneric(self, baseURL):
        # Does a generic parsing of the articles. There are six categories (0-5)
        # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
        # NoSpin and TV are generic
        fullReturn = []
        for i in range(len(self.catList)) :
            articleList = []
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        print("Exiting parseTalkingPoints\n")
        return articleList
            # Problem: 0-2 create many in an array
            # 3-5 create one.
            # So no for-div for 3-5

    def parseCurrentColumn(self, baseURL, soupURL, debugMessages) :
        # Only needed to get the column title. Otherwise it's all good already; there's only one column
        articleList = []
        soup = self.index_to_soup(soupURL)
        titleSpan = soup.find('span', {'class':['defaultHeader']})
        title = titleSpan.contents[0]
        # Get Print URL since it's available
        printURL = self.extractPrintURL(baseURL, soupURL, "Print This Article")
        if printURL:
            print("Found print URL")
            url = printURL
        if debugMessages :
            print("url: "+url)
            print("title:"+title)
        description = 'None'
        pubdate = time.strftime('%a, %d %b')
        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages :
            print("Leaving Stratfor Parse ")
        return articleList
            if i < 3 :
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print(div)
                    if i == 1:
                        a = div.find('a', href=True)
                    else :
                        a = div
                    print(a)
                    summary = div.find(True, attrs={'class':'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL+re.sub(r'\?.*', '', a['href'])
                    url = baseURL+a['href']
                    if i < 2 :
                        url = self.extractPrintURL(baseURL, url, "Print this entry")
                        title = self.tag_to_string(a, use_alt=True).strip()
                    elif i == 2 :
                        # Daily Briefs
                        url = self.extractPrintURL(baseURL, url, "Print this entry")
                        title = div.contents[0]
                    if self.debugMessages :
                        print(title+" @ "+url)
                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))

            elif i == 3 : # Stratfor
                a = soup.find('a', self.catList[i][3])
                if a is None :
                    continue
                url = baseURL+a['href']
                title = self.tag_to_string(a, use_alt=True).strip()
                # Get Stratfor contents so we can get the real title.
                stratSoup = self.index_to_soup(url)
                title = stratSoup.html.head.title.string
                stratIndex = title.find('Stratfor.com:', 0)
                if (stratIndex > -1) :
                    title = title[stratIndex+14:-1]
                # Look for first blogBody <td class="blogBody"
                # Changed 12 Jan 2012 - new page format
                #stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
                #stratBody = stratSoup.find('td', {'class':['blogBody']})
            elif i == 4 : # Talking Points
                topDate = soup.find("td", "blogBody")
                if not topDate :
                    print("Failed to find date in Talking Points")
                # This page has the contents in double-wrapped tables!
                myTable = topDate.findParents('table')[0]
                if myTable is not None:
                    upOneTable = myTable.findParents('table')[0]
                    if upOneTable is not None:
                        upTwo = upOneTable.findParents('table')[0]
                        if upTwo is None:
                            continue
                        # Now navigate rows of upTwo
                        if self.debugMessages :
                            print("Entering rows")
                        for rows in upTwo.findChildren("tr", recursive=False):
                            # Inside top level table, each row is an article
                            rowTable = rows.find("table")
                            articleTable = rowTable.find("table")
                            # This looks wrong.
                            articleTable = rows.find("tr")
                            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
                            blogDate = articleTable.find("a","blogDate").contents[0]
                            # Skip to second blogBody for this.
                            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
                            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
                            url = baseURL+re.sub(r'\?.*', '', blogURL)
                            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
                            if self.debugMessages :
                                print("Talking Points Memo title "+title+" at url: "+url)
                            pubdate = time.strftime('%a, %d %b')
                            articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
            else : # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None :
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
            if i == 3 or i == 5 :
                if self.debugMessages :
                    print(self.catList[i][0]+" Title:"+title+" at url: "+url)
                summary = div.find(True, attrs={'class':'summary'})
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)
                articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            self.catList[i][3] = articleList
            fullReturn.append((self.catList[i][0], articleList))
        return fullReturn

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # returns a list of tuple ('feed title', list of articles)
@@ -231,27 +183,8 @@ class OReillyPremium(BasicNewsRecipe):
    # this is used instead of BasicNewsRecipe.parse_feeds().
    def parse_index(self):
        # Parse the page into Python Soup
        debugMessages = True
        baseURL = "https://www.billoreilly.com"
        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=False)).strip()
        # [] is list, {} is empty mapping.
        articleList = []
        ans = []
        showList = self.parseTVArchives(baseURL, 'https://www.billoreilly.com/show?action=tvShowArchive', debugMessages)
        articleList = self.parseNoSpinArchives(baseURL, 'https://www.billoreilly.com/blog?categoryID=7', debugMessages)
        stratList = self.parseStratfor(baseURL, 'http://www.billoreilly.com/blog?categoryID=5', debugMessages)
        dailyBriefs = self.parseDailyBriefs(baseURL, 'http://www.billoreilly.com/blog?categoryID=11', debugMessages)
        talkingPoints = self.parseTalkingPoints(baseURL, 'https://www.billoreilly.com/blog?categoryID=12', debugMessages)
        currentColumn = self.parseCurrentColumn(baseURL, 'https://www.billoreilly.com/currentcolumn', debugMessages)
        # Below, { x:y, a:b } creates a dictionary. We return a tuple of a title and list of dict...
        # Lists are constructed with square brackets, separating items with commas: [a, b, c]. Tuples are constructed by the comma operator (not within square brackets), with or without enclosing parentheses, but an empty tuple must have the enclosing parentheses, such as a, b, c or (). A single item tuple must have a trailing comma, such as (d,).
        # Shows first two if talking points and no spin news. Also if they are TV Shows and Stratfor Weekly, also if Daily Briefing and Current Column
# So all work individually. No idea why only getting first two in TOC now.
|
||||
ans = [("Talking Points Memos", talkingPoints),("No Spin News", articleList),("TV Shows", showList),("Stratfor Weekly",stratList), ("Daily Briefing", dailyBriefs),("Current Column", currentColumn)]
|
||||
if debugMessages :
|
||||
print ans
|
||||
return ans
|
||||
return self.parseGeneric(baseURL)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
refresh = soup.find('meta', {'http-equiv':'refresh'})
|
||||
|
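
As the comments above note, parse_index() must return a list of ('feed title', list-of-article-dicts) tuples; parseGeneric now builds that structure for all six categories. A minimal sketch of the expected shape (the values are placeholders, not taken from the site):

feeds = [
    ('No Spin News', [
        {'title': 'Example article',
         'url': 'https://www.billoreilly.com/example-print-page',   # placeholder URL
         'date': 'Mon, 16 Jan', 'description': 'None', 'content': ''},
    ]),
]
# parse_index() returns `feeds`; calibre then fetches each article url in turn
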
@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.variety.com
'''
@@ -14,11 +14,11 @@ class Variety(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    encoding = 'utf8'
    publisher = 'Red Business Information'
    category = 'Entertainment Industry News, Daily Variety, Movie Reviews, TV, Awards, Oscars, Cannes, Box Office, Hollywood'
    language = 'en'
    masthead_url = 'http://a330.g.akamai.net/7/330/23382/20090528190853/www.variety.com/graphics/variety/Variety_logo_green_tm.gif'
    masthead_url = 'http://images1.variety.com/graphics/variety/Variety_logo_green_tm.gif'
    extra_css = ' body{font-family: Georgia,"Times New Roman",Times,Courier,serif } img{margin-bottom: 1em} '

    conversion_options = {
@@ -30,17 +30,10 @@ class Variety(BasicNewsRecipe):

    remove_tags = [dict(name=['object','link','map'])]

    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
    keep_only_tags = [dict(name='div', attrs={'class':'art control'})]

    feeds = [(u'News & Articles', u'http://feeds.feedburner.com/variety/headlines' )]

    def print_version(self, url):
        rpt = url.rpartition('?')[0]
        artid = rpt.rpartition('/')[2]
        catidr = url.rpartition('categoryid=')[2]
        catid = catidr.partition('&')[0]
        return 'http://www.variety.com/index.asp?layout=print_story&articleid=' + artid + '&categoryid=' + catid


    def preprocess_html(self, soup):
        return self.adeify_images(soup)
        rpt = url.rpartition('.html')[0]
        return rpt + '?printerfriendly=true'
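
The new print_version no longer reassembles articleid/categoryid query parameters; it strips the trailing '.html' and asks for the printer-friendly view. Roughly (the article URL is invented for illustration):

url = 'http://www.variety.com/article/VR1118012345.html'
rpt = url.rpartition('.html')[0]
print(rpt + '?printerfriendly=true')
# -> 'http://www.variety.com/article/VR1118012345?printerfriendly=true'
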
recipes/villagevoice.recipe (new file, 46 lines)
@@ -0,0 +1,46 @@
#!/usr/bin/env python

import re
from calibre.web.feeds.news import BasicNewsRecipe

class VillageVoice(BasicNewsRecipe):

    title = 'Village Voice'
    feeds = [
        ("Complete Issue", "http://villagevoice.com/syndication/issue"),
        ("News", "http://villagevoice.com/syndication/section/news"),
        ("Music", "http://villagevoice.com/syndication/section/music"),
        ("Movies", "http://villagevoice.com/syndication/section/film"),
        #("Restaurants", "http://villagevoice.com/syndication/section/dining"),
        #("Music Events", "http://villagevoice.com/syndication/events?type=music"),
        #("Calendar Events", "http://villagevoice.com/syndication/events"),
        #("Promotional Events", "http://villagevoice.com/syndication/promoEvents"),
        #("Restaurant Guide", "http://villagevoice.com/syndication/restaurants/search")
    ]

    auto_cleanup = True
    max_articles_per_feed = 50
    masthead_url = "http://assets.villagevoice.com/img/citylogo.png"
    language = 'en'
    __author__ = 'Barty'

    seen_urls = []

    # village voice breaks the article up into multiple pages, so
    # parse page and grab the print url

    url_regex = re.compile(r'\/content\/printVersion\/\d+',re.I)

    def print_version(self, url):
        if url in self.seen_urls:
            return None
        self.seen_urls.append( url)
        soup = self.index_to_soup(url)
        atag = soup.find('a',attrs={'href':self.url_regex})
        if atag is None:
            self.log('Warning: no print url found for '+url)
        else:
            m = self.url_regex.search(atag['href'])
            if m:
                url = 'http://www.villagevoice.com'+m.group(0)
        return url
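
The print-link pattern above matches the site's printer-friendly paths. For example (a made-up article id, just to show the shape of the match):

import re
url_regex = re.compile(r'\/content\/printVersion\/\d+', re.I)
m = url_regex.search('/news/some-story-6431234/content/printVersion/6431299/')
print(m.group(0))   # '/content/printVersion/6431299'
# the recipe then prefixes 'http://www.villagevoice.com' to that match
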
@@ -197,7 +197,7 @@ title_series_sorting = 'library_order'
# For example, if the tweak is set to library_order, "The Lord of the Rings"
# will become "Lord of the Rings, The". If the tweak is set to
# strictly_alphabetic, it would remain "The Lord of the Rings". Note that the
# formatter function raw_field will return the base value for title and
# formatter function raw_field will return the base value for title and
# series regardless of the setting of this tweak.
save_template_title_series_sorting = 'library_order'

@@ -13,6 +13,7 @@
3. Much more comprehensive testing/error handling
4. Properly encodes/decodes assertions
5. Handles points in the padding of elements consistently
6. Has a utility method to calculate the CFI for the current viewport position robustly

To check if this script is compatible with the current browser, call
window.cfi.is_compatible() it will throw an exception if not compatible.
@@ -72,7 +73,7 @@ get_current_time = (target) -> # {{{
    fstr(ans)
# }}}

window_scroll_pos = (win) -> # {{{
window_scroll_pos = (win=window) -> # {{{
    if typeof(win.pageXOffset) == 'number'
        x = win.pageXOffset
        y = win.pageYOffset
@@ -86,18 +87,18 @@ window_scroll_pos = (win) -> # {{{
    return [x, y]
# }}}

viewport_to_document = (x, y, doc) -> # {{{
viewport_to_document = (x, y, doc=window?.document) -> # {{{
    until doc == window.document
        # We are in a frame
        frame = doc.defaultView.frameElement
        rect = frame.getBoundingClientRect()
        x += rect.left
        y += rect.top
        doc = frame.ownerDocument
    win = doc.defaultView
    [wx, wy] = window_scroll_pos(win)
    x += wx
    y += wy
    if doc != window.document
        # We are in a frame
        node = win.frameElement
        rect = node.getBoundingClientRect()
        [vx, vy] = viewport_to_document(rect.left, rect.top, node.ownerDocument)
        x += vx
        y += vy
    return [x, y]
# }}}

@@ -157,7 +158,8 @@ class CanonicalFragmentIdentifier
    is_compatible(): Throws an error if the browser is not compatible with
                     this script

    at(x, y): which maps a point to a CFI, if possible
    at(x, y): Maps a point to a CFI, if possible
    at_current(): Returns the CFI corresponding to the current viewport scroll location

    scroll_to(cfi): which scrolls the browser to a point corresponding to the
                    given cfi, and returns the x and y co-ordinates of the point.
@@ -397,6 +399,8 @@ class CanonicalFragmentIdentifier
            if not cd
                break

            # We have an embedded document, transforms x, y into the co-ord
            # system of the embedded document's viewport
            rect = target.getBoundingClientRect()
            x -= rect.left
            y -= rect.top
@@ -557,11 +561,73 @@ class CanonicalFragmentIdentifier
        null
    # }}}

    current_cfi: () -> # {{{
    at_current: () -> # {{{
        [winx, winy] = window_scroll_pos()
        [winw, winh] = [window.innerWidth, window.innerHeight]
        max = Math.max
        winw = max(winw, 400)
        winh = max(winh, 600)
        deltay = Math.floor(winh/50)
        deltax = Math.floor(winw/25)
        miny = max(-winy, -winh)
        maxy = winh
        minx = max(-winx, -winw)
        maxx = winw

        dist = (p1, p2) ->
            Math.sqrt(Math.pow(p1[0]-p2[0], 2), Math.pow(p1[1]-p2[1], 2))

        get_cfi = (ox, oy) ->
            try
                cfi = this.at(ox, oy)
                point = this.point(cfi)
            catch err
                cfi = null

            if point.range != null
                r = point.range
                rect = r.getClientRects()[0]

                x = (point.a*rect.left + (1-point.a)*rect.right)
                y = (rect.top + rect.bottom)/2
                [x, y] = viewport_to_document(x, y, r.startContainer.ownerDocument)
            else
                node = point.node
                r = node.getBoundingClientRect()
                [x, y] = viewport_to_document(r.left, r.top, node.ownerDocument)
                if typeof(point.x) == 'number' and node.offsetWidth
                    x += (point.x*node.offsetWidth)/100
                if typeof(point.y) == 'number' and node.offsetHeight
                    y += (point.y*node.offsetHeight)/100

            if dist(viewport_to_document(ox, oy), [x, y]) > 50
                cfi = null

            return cfi

        x_loop = (cury) ->
            for direction in [-1, 1]
                delta = deltax * direction
                curx = 0
                until (direction < 0 and curx < minx) or (direction > 0 and curx > maxx)
                    cfi = get_cfi(curx, cury)
                    if cfi
                        return cfi
                    curx += delta
            null

        for direction in [-1, 1]
            delta = deltay * direction
            cury = 0
            until (direction < 0 and cury < miny) or (direction > 0 and cury > maxy)
                cfi = x_loop(cury, -1)
                if cfi
                    return cfi
                cury += delta

        # TODO: Return the CFI corresponding to the <body> tag
        null

    # }}}
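
at_current() above looks for the first point in the visible viewport that maps cleanly to a CFI: it walks candidate points outward from the top-left corner in deltax/deltay steps, asks at() for a CFI, converts that CFI back into a point, and rejects it if the round trip lands more than 50 pixels away. A rough Python rendering of just the scan order (not part of the commit; get_cfi stands in for the CoffeeScript closure above, and deltax/deltay are assumed positive):

def scan_for_cfi(get_cfi, minx, maxx, miny, maxy, deltax, deltay):
    # Walk rows outward from y=0; within each row walk columns outward from x=0,
    # returning the first candidate point for which get_cfi() yields a usable CFI.
    for ydir in (-1, 1):
        cury = 0
        while (ydir < 0 and cury >= miny) or (ydir > 0 and cury <= maxy):
            for xdir in (-1, 1):
                curx = 0
                while (xdir < 0 and curx >= minx) or (xdir > 0 and curx <= maxx):
                    cfi = get_cfi(curx, cury)
                    if cfi:
                        return cfi
                    curx += deltax * xdir
            cury += deltay * ydir
    return None
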

if window?
@@ -59,26 +59,13 @@ mark_and_reload = (evt) ->
    setTimeout(fn, 1)
    null

window_scroll_pos = (win) ->
    if typeof(win.pageXOffset) == 'number'
        x = win.pageXOffset
        y = win.pageYOffset
    else # IE < 9
        if document.body and ( document.body.scrollLeft or document.body.scrollTop )
            x = document.body.scrollLeft
            y = document.body.scrollTop
        else if document.documentElement and ( document.documentElement.scrollLeft or document.documentElement.scrollTop)
            y = document.documentElement.scrollTop
            x = document.documentElement.scrollLeft
    return [x, y]

frame_clicked = (evt) ->
    iframe = evt.target.ownerDocument.defaultView.frameElement
    # We know that the offset parent of the iframe is body
    # So we can easily calculate the event co-ords w.r.t. the browser window
    [winx, winy] = window_scroll_pos(window)
    x = evt.clientX + iframe.offsetLeft - winx
    y = evt.clientY + iframe.offsetTop - winy
    rect = iframe.getBoundingClientRect()
    x = evt.clientX + rect.left
    y = evt.clientY + rect.top
    mark_and_reload({'clientX':x, 'clientY':y, 'button':evt.button})

window.onload = ->
@@ -23,6 +23,7 @@
indignation and dislike men who are so beguiled and demoralized by
the charms of pleasure of the moment, so blinded by desire, that
they cannot foresee</p>
<p><img src="marker.png" width="300" height="300" alt="Test image"/></p>

</body>
</html>
@@ -1,7 +1,7 @@
<!DOCTYPE html>
<html>
<head>
    <title>Testing EPUB CFI</title>
    <title>Testing cfi.coffee</title>
    <script type="text/javascript" src="cfi.coffee"></script>
    <script type="text/javascript" src="cfi-test.coffee"></script>
    <style type="text/css">
@@ -46,7 +46,8 @@
</head>
<body>
    <div id="container">
        <h1 id="first-h1">Testing EPUB CFI</h1>
        <h1 id="first-h1">Testing cfi.coffee</h1>
        <p>Click anywhere and the location will be marked with a marker, whose position is set via a CFI.</p>
        <p><a id="reset" href="/">Reset CFI to None</a></p>
        <h2>A div with scrollbars</h2>
        <p>Scroll down and click on some elements. Make sure to hit both
@@ -462,7 +462,7 @@ class Scheduler(QObject):
        delta = timedelta(days=self.oldest)
        try:
            ids = list(self.db.tags_older_than(_('News'),
                delta))
                delta, must_have_authors=['calibre']))
        except:
            # Happens if library is being switched
            ids = []
@@ -362,7 +362,7 @@
   <item>
    <widget class="QLabel" name="label_7">
     <property name="text">
      <string>&amp;Delete downloaded news older than:</string>
      <string>Delete downloaded news &amp;older than:</string>
     </property>
     <property name="buddy">
      <cstring>old_news</cstring>
@@ -73,6 +73,9 @@ class JavaScriptLoader(object):
            src = self.get(x)
            evaljs(src)

        if not lang:
            lang = 'en'

        def lang_name(l):
            l = l.lower()
            l = lang_as_iso639_1(l)
@@ -2002,7 +2002,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):

    ############# End get_categories

    def tags_older_than(self, tag, delta, must_have_tag=None):
    def tags_older_than(self, tag, delta, must_have_tag=None,
                        must_have_authors=None):
        '''
        Return the ids of all books having the tag ``tag`` that are older
        than the specified time. tag comparison is case insensitive.
@@ -2011,6 +2012,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        the tag are returned.
        :param must_have_tag: If not None the list of matches will be
                              restricted to books that have this tag
        :param must_have_authors: A list of authors. If not None the list of
                                  matches will be restricted to books that have these authors (case
                                  insensitive).
        '''
        tag = tag.lower().strip()
        mht = must_have_tag.lower().strip() if must_have_tag else None
@@ -2018,9 +2022,18 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        tindex = self.FIELD_MAP['timestamp']
        gindex = self.FIELD_MAP['tags']
        iindex = self.FIELD_MAP['id']
        aindex = self.FIELD_MAP['authors']
        mah = must_have_authors
        if mah is not None:
            mah = [x.replace(',', '|').lower() for x in mah]
            mah = ','.join(mah)
        for r in self.data._data:
            if r is not None:
                if delta is None or (now - r[tindex]) > delta:
                    if mah:
                        authors = r[aindex] or ''
                        if authors.lower() != mah:
                            continue
                    tags = r[gindex]
                    if tags:
                        tags = [x.strip() for x in tags.lower().split(',')]
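
The Scheduler change earlier in this commit is the consumer of the new parameter: old news is only deleted when it carries the News tag and is authored by calibre. A usage sketch under those assumptions (db is an open LibraryDatabase2, _() is calibre's translation function, and the 60-day delta is arbitrary):

from datetime import timedelta
# restrict the match to books authored by 'calibre', as the scheduler now does
ids = list(db.tags_older_than(_('News'), timedelta(days=60),
                              must_have_authors=['calibre']))
# ids is then handed to the scheduler's deletion routine
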
@@ -3205,6 +3218,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        stream.seek(0)
        mi = get_metadata(stream, format, use_libprs_metadata=False,
                          force_read_metadata=True)
        # Force the author to calibre as the auto delete of old news checks for
        # both the author==calibre and the tag News
        mi.authors = ['calibre']
        stream.seek(0)
        if mi.series_index is None:
            mi.series_index = self.get_next_series_num_for(mi.series)
@@ -12,7 +12,7 @@ Utilities to help with developing coffeescript based apps.
A coffeescript compiler and a simple web server that automatically serves
coffeescript files as javascript.
'''
import sys, traceback, importlib, io
import sys, traceback, io
if sys.version_info.major > 2:
    print('This script is not Python 3 compatible. Run it with Python 2',
            file=sys.stderr)
@@ -22,125 +22,48 @@ import time, BaseHTTPServer, os, sys, re, SocketServer
from threading import Lock
from SimpleHTTPServer import SimpleHTTPRequestHandler

from PyQt4.QtWebKit import QWebPage
from PyQt4.Qt import QThread, QApplication
from PyQt4.Qt import QCoreApplication, QScriptEngine, QScriptValue

# Infrastructure {{{
def import_from_calibre(mod):
    try:
        return importlib.import_module(mod)
    except ImportError:
        import init_calibre
        init_calibre
        return importlib.import_module(mod)

_store_app = gui_thread = None
def check_qt():
    global gui_thread, _store_app
    _plat = sys.platform.lower()
    iswindows = 'win32' in _plat or 'win64' in _plat
    isosx = 'darwin' in _plat
    islinux = not (iswindows or isosx)

    if islinux and ':' not in os.environ.get('DISPLAY', ''):
        raise RuntimeError('X server required. If you are running on a'
                ' headless machine, use xvfb')
    if _store_app is None and QApplication.instance() is None:
        _store_app = QApplication([])
    if gui_thread is None:
        gui_thread = QThread.currentThread()
    if gui_thread is not QThread.currentThread():
        raise RuntimeError('Cannot use Qt in non GUI thread')

def fork_job(*args, **kwargs):
    try:
        return import_from_calibre('calibre.utils.ipc.simple_worker').fork_job(*args,
                **kwargs)
    except ImportError:
        # We aren't running in calibre
        import subprocess
        raw, filename = kwargs['args']
        cs = ''
        try:
            p = subprocess.Popen([sys.executable, __file__, 'compile', '-'],
                    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
            if isinstance(raw, unicode):
                raw = raw.encode('utf-8')
            stdout, stderr = p.communicate(raw)
            cs = stdout.decode('utf-8')
            errors = [stderr]
        except:
            errors = [traceback.format_exc()]

        return {'result':(cs, errors)}

# }}}

class Compiler(QWebPage): # {{{
class Compiler(QScriptEngine): # {{{

    '''
    Never use this class in anything except the main thread. If you want to use
    it from other threads, use the forked_compile method instead.
    You can use this class in any thread, but make sure you instantiate it in
    the main thread. Alternatively, construct a QCoreApplication in the main
    thread, after which you can instantiate this class and use it in any
    thread.
    '''

    def __init__(self):
        check_qt()
        QWebPage.__init__(self)
        self.frame = self.mainFrame()
        self.filename = self._src = ''
        self.frame.evaluateJavaScript(CS_JS)
        self.frame.addToJavaScriptWindowObject("cs_compiler", self)
        self.errors = []
        if QCoreApplication.instance() is None:
            self.__app_ = QCoreApplication([])

    def shouldInterruptJavaScript(self):
        return True

    def javaScriptConsoleMessage(self, msg, lineno, sourceid):
        sourceid = sourceid or self.filename or '<script>'
        self.errors.append('%s:%s'%(sourceid, msg))

    def __evalcs(self, raw, filename):
        # This method is NOT thread safe
        self.filename = filename
        self.setProperty('source', raw)
        self.errors = []
        res = self.frame.evaluateJavaScript('''
            raw = document.getElementById("raw");
            raw = cs_compiler.source;
            CoffeeScript.compile(raw);
            ''')
        ans = ''
        if res.type() == res.String:
            ans = unicode(res.toString())
        return ans, list(self.errors)
        QScriptEngine.__init__(self)
        res = self.evaluate(CS_JS, 'coffee-script.js')
        if res.isError():
            raise Exception('Failed to run the coffee script compiler: %s'%
                    unicode(res.toString()))
        self.lock = Lock()

    def __call__(self, raw, filename=None):
        if not isinstance(raw, unicode):
            raw = raw.decode('utf-8')
        return self.__evalcs(raw, filename)

def forked_compile(raw, fname):
    # Entry point for the compile worker
    try:
        ans, errors = Compiler()(raw, fname)
    except:
        ans, errors = '', [traceback.format_exc()]
    return ans, errors
        with self.lock:
            if not isinstance(raw, unicode):
                raw = raw.decode('utf-8')
            if not filename:
                filename = '<string>'
            go = self.globalObject()
            go.setProperty('coffee_src', QScriptValue(raw),
                    go.ReadOnly|go.Undeletable)
            res = self.evaluate('this.CoffeeScript.compile(this.coffee_src)',
                    filename)
            if res.isError():
                return '', [unicode(res.toString())]
            return unicode(res.toString()), []


# }}}

def compile_coffeescript(raw, filename=None):
    try:
        cs, errors = fork_job('calibre.utils.serve_coffee',
                'forked_compile', args=(raw, filename), timeout=5,
                no_output=True)['result']
    except Exception as e:
        cs = None
        errors = [getattr(e, 'orig_tb', traceback.format_exc())]

    return cs, errors
    return Compiler()(raw, filename)

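With the QScriptEngine-based Compiler, compilation is an ordinary synchronous call protected by a lock, and the web server simply keeps one shared instance around (Handler.compiler = Compiler() in serve() below). A small usage sketch, assuming it runs inside this module where CS_JS (the bundled coffee-script.js source) is defined:

compiler = Compiler()
js, errors = compiler('square = (x) -> x * x', 'demo.coffee')
if errors:
    for line in errors:
        print(line)       # compilation errors, one per line
else:
    print(js)             # the generated JavaScript
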
class HTTPRequestHandler(SimpleHTTPRequestHandler): # {{{
    '''
@@ -317,7 +240,7 @@ class Handler(HTTPRequestHandler): # {{{
            mtime = time.time()
            with open(src, 'rb') as f:
                raw = f.read()
            cs, errors = compile_coffeescript(raw, src)
            cs, errors = self.compiler(raw, src)
            for line in errors:
                print(line, file=sys.stderr)
            if not cs:
@@ -351,6 +274,7 @@ class Server(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer): # {{{

def serve(resources={}, port=8000, host='0.0.0.0'):
    Handler.special_resources = resources
    Handler.compiler = Compiler()
    httpd = Server((host, port), Handler)
    print('serving %s at %s:%d with PID=%d'%(os.getcwdu(), host, port, os.getpid()))
    try: