Merge from trunk

Charles Haley 2012-01-17 10:52:47 +01:00
commit c68075bc08
14 changed files with 320 additions and 347 deletions

View File

@@ -1,6 +1,6 @@
 __license__ = 'GPL v3'
-__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 blic.rs
 '''
@@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
     def print_version(self, url):
         return url + '/print'

-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.blic.rs/')
+        alink = soup.find('a', attrs={'id':'blic_naslovna_print'})
+        if alink:
+            return 'http://www.blic.rs' + alink['href']
+        return None
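For reference, the new get_cover_url just looks for the front page's print-edition link and prefixes the site domain. A standalone sketch of that lookup (the HTML snippet is invented, and bs4 stands in for calibre's index_to_soup helper):

from bs4 import BeautifulSoup

html = '<a id="blic_naslovna_print" href="/izdanja/naslovna.jpg">naslovna</a>'
soup = BeautifulSoup(html, 'html.parser')
alink = soup.find('a', attrs={'id': 'blic_naslovna_print'})
cover = 'http://www.blic.rs' + alink['href'] if alink else None
print(cover)   # http://www.blic.rs/izdanja/naslovna.jpg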

View File

@@ -1,8 +1,15 @@
+# Talking Points is not grabbing everything.
+# The look is right, but only the last one added?
 import re
 import time
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+# strip ads and graphics
+# Current Column lacks a title.
+# Talking Points Memo - shorten title - Remove year and Bill's name
+# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
+# Newsletters: Talking Points Memos covered by cat12

 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
@@ -19,7 +26,17 @@ class OReillyPremium(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 2000
+    language = 'en'
+    debugMessages = True
+    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
+    catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
+        ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+        ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+        ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+        ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
+    ]

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -31,6 +48,8 @@ class OReillyPremium(BasicNewsRecipe):
         br.submit()
         return br

+    # Returns the best-guess print url.
+    # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, baseURL, pageURL, printString):
         tagURL = pageURL
         soup = self.index_to_soup(pageURL)
@@ -38,7 +57,6 @@ class OReillyPremium(BasicNewsRecipe):
             printText = soup.find('a', text=printString)
         else :
             print("Failed to find Print string "+printString+ " in "+pageURL)
-
         if printText:
             tag = printText.parent
             tagURL = baseURL+tag['href']
@@ -47,177 +65,111 @@
     def stripBadChars(self, inString) :
         return inString.replace("\'", "")

-    # returns a qualifying article list
-    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        for div in soup.findAll(True, attrs={'class':['blogBody'], 'style':['padding-top:10px;']}):
-            a = div.find('a', href=True)
-            if not a:
-                continue
-            # re == regex. [href] is the link
-            url = baseURL
-            url +=re.sub(r'\?.*', '', a['href'])
-            # Get print version
-            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
-            if printURL:
-                url = printURL
-            title = self.tag_to_string(a, use_alt=True).strip()
-            if debugMessages :
-                print("No Spin Archive Title:"+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        return articleList
-
-    def parseTVArchives(self, baseURL, soupURL, debugMessages):
-        # TV Archives page has some Ajax, so look for the static only.
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        if debugMessages :
-            print("In parseTVArchives")
-        for div in soup.findAll('a', {'class':['showLinks','homeLinks']}):
-            a = div
-            url = baseURL
-            url +=a['href']
-            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
-            if printURL:
-                url = printURL
-            title = self.tag_to_string(a, use_alt=True).strip()
-            title = self.stripBadChars(title)
-            if debugMessages :
-                print("TV Archive "+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        if debugMessages :
-            print("Leaving TV Parse ")
-        return articleList
-
-    # Get Daily Briefing Archives
-    def parseDailyBriefs(self, baseURL, soupURL, debugMessages) :
-        print("Starting daily briefs")
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        for div in soup.findAll(True, attrs={'class':['defaultHeaderSmallLinks']}):
-            # re == regex. [href] is the link
-            url = baseURL
-            url +=re.sub(r'\?.*', '', div['href'])
-            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
-            if printURL:
-                url = printURL
-            title = div.contents[0]
-            if debugMessages :
-                print("Daily Brief - title:"+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        print("Leaving daily briefs")
-        return articleList
-
-    # Get the weekly Stratfor intelligence report
-    def parseStratfor(self, baseURL, soupURL, debugMessages):
-        # http://www.billoreilly.com/blog?categoryID=5
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        if debugMessages :
-            print("In parseStratfor")
-        a = soup.find('a', {'class':['blogLinks']})
-        url = baseURL
-        url +=a['href']
-        title = self.tag_to_string(a, use_alt=True).strip()
-        if debugMessages :
-            print("url: "+url)
-            print("title:"+title)
-        # Get Stratfor contents so we can get the real title.
-        stratSoup = self.index_to_soup(url)
-        title = stratSoup.html.head.title.string
-        stratIndex = title.find('Stratfor.com:', 0)
-        if (stratIndex > -1) :
-            title = title[stratIndex+14:-1]
-        # Look for first blogBody <td class="blogBody"
-        stratBody = stratSoup.find('td', {'class':['blogBody']})
-        if debugMessages :
-            print("Strat content title:"+title)
-            print("Strat body: "+ stratBody.contents[0])
-        description = 'None'
-        pubdate = time.strftime('%a, %d %b')
-        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        if debugMessages :
-            print("Leaving Stratfor Parse ")
-        return articleList
-
-    def parseTalkingPoints(self, baseURL, soupURL, debugMessages) :
-        # Look for blogDate. That's got the date... Then the next blogBody has the title. and then an anchor with class "homeBlogReadMore bold" has the URL.
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        if debugMessages :
-            print("Starting Talking Points")
-        topDate = soup.find("td", "blogBody")
-        if not topDate :
-            print("Failed to find date in Talking Points")
-        # This page has the contents in double-wrapped tables!
-        # tableParent = topDate.parent.parent
-        myTable = topDate.findParents('table')[0]
-        upOneTable = myTable.findParents('table')[0]
-        upTwo = upOneTable.findParents('table')[0]
-        # Now navigate rows of upTwo
-        if debugMessages :
-            print("Entering rows")
-        for rows in upTwo.findChildren("tr", recursive=False):
-            # Inside top level table, each row is an article
-            rowTable = rows.find("table")
-            articleTable = rowTable.find("table")
-            articleTable = rows.find("tr")
-            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
-            blogDate = articleTable.find("a","blogDate").contents[0]
-            # Skip to second blogBody for this.
-            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
-            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
-            # re == regex. [href] is the link
-            url = baseURL
-            url +=re.sub(r'\?.*', '', blogURL)
-            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
-            if debugMessages :
-                print("Talking Points Memo title "+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        print("Exiting parseTalkingPoints\n")
-        return articleList
-
-    def parseCurrentColumn(self, baseURL, soupURL, debugMessages) :
-        # Only needed to get the column title. Otherwise it's all good already; there's only one column
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        titleSpan = soup.find('span', {'class':['defaultHeader']})
-        title = titleSpan.contents[0]
-        # Get Print URL since it's available
-        printURL = self.extractPrintURL(baseURL, soupURL, "Print This Article")
-        if printURL:
-            print("Found print URL")
-            url = printURL
-        if debugMessages :
-            print("url: "+url)
-            print("title:"+title)
-        description = 'None'
-        pubdate = time.strftime('%a, %d %b')
-        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        if debugMessages :
-            print("Leaving Stratfor Parse ")
-        return articleList
-
+    def parseGeneric(self, baseURL):
+        # Does a generic parsing of the articles. There are six categories (0-5)
+        # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
+        # NoSpin and TV are generic
+        fullReturn = []
+        for i in range(len(self.catList)) :
+            articleList = []
+            soup = self.index_to_soup(self.catList[i][1])
+            # Set defaults
+            description = 'None'
+            pubdate = time.strftime('%a, %d %b')
+            # Problem: 0-2 create many in an array
+            # 3-5 create one.
+            # So no for-div for 3-5
+            if i < 3 :
+                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
+                    print(div)
+                    if i == 1:
+                        a = div.find('a', href=True)
+                    else :
+                        a = div
+                    print(a)
+                    summary = div.find(True, attrs={'class':'summary'})
+                    if summary:
+                        description = self.tag_to_string(summary, use_alt=False)
+                    if not a:
+                        continue
+                    # url = baseURL+re.sub(r'\?.*', '', a['href'])
+                    url = baseURL+a['href']
+                    if i < 2 :
+                        url = self.extractPrintURL(baseURL, url, "Print this entry")
+                        title = self.tag_to_string(a, use_alt=True).strip()
+                    elif i == 2 :
+                        # Daily Briefs
+                        url = self.extractPrintURL(baseURL, url, "Print this entry")
+                        title = div.contents[0]
+                    if self.debugMessages :
+                        print(title+" @ "+url)
+                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+            elif i == 3 : # Stratfor
+                a = soup.find('a', self.catList[i][3])
+                if a is None :
+                    continue
+                url = baseURL+a['href']
+                title = self.tag_to_string(a, use_alt=True).strip()
+                # Get Stratfor contents so we can get the real title.
+                stratSoup = self.index_to_soup(url)
+                title = stratSoup.html.head.title.string
+                stratIndex = title.find('Stratfor.com:', 0)
+                if (stratIndex > -1) :
+                    title = title[stratIndex+14:-1]
+                # Look for first blogBody <td class="blogBody"
+                # Changed 12 Jan 2012 - new page format
+                #stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
+                #stratBody = stratSoup.find('td', {'class':['blogBody']})
+            elif i == 4 : # Talking Points
+                topDate = soup.find("td", "blogBody")
+                if not topDate :
+                    print("Failed to find date in Talking Points")
+                # This page has the contents in double-wrapped tables!
+                myTable = topDate.findParents('table')[0]
+                if myTable is not None:
+                    upOneTable = myTable.findParents('table')[0]
+                    if upOneTable is not None:
+                        upTwo = upOneTable.findParents('table')[0]
+                        if upTwo is None:
+                            continue
+                        # Now navigate rows of upTwo
+                        if self.debugMessages :
+                            print("Entering rows")
+                        for rows in upTwo.findChildren("tr", recursive=False):
+                            # Inside top level table, each row is an article
+                            rowTable = rows.find("table")
+                            articleTable = rowTable.find("table")
+                            # This looks wrong.
+                            articleTable = rows.find("tr")
+                            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
+                            blogDate = articleTable.find("a","blogDate").contents[0]
+                            # Skip to second blogBody for this.
+                            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
+                            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
+                            url = baseURL+re.sub(r'\?.*', '', blogURL)
+                            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
+                            if self.debugMessages :
+                                print("Talking Points Memo title "+title+" at url: "+url)
+                            pubdate = time.strftime('%a, %d %b')
+                            articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
+            else : # Current Column
+                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
+                if titleSpan is None :
+                    continue
+                title = titleSpan.contents[0]
+                url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
+            if i == 3 or i == 5 :
+                if self.debugMessages :
+                    print(self.catList[i][0]+" Title:"+title+" at url: "+url)
+                summary = div.find(True, attrs={'class':'summary'})
+                if summary:
+                    description = self.tag_to_string(summary, use_alt=False)
+                articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+            self.catList[i][3] = articleList
+            fullReturn.append((self.catList[i][0], articleList))
+        return fullReturn
     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
@@ -231,27 +183,8 @@
     # this is used instead of BasicNewsRecipe.parse_feeds().
     def parse_index(self):
         # Parse the page into Python Soup
-        debugMessages = True
         baseURL = "https://www.billoreilly.com"
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-        # [] is list, {} is empty mapping.
-        articleList = []
-        ans = []
-        showList = self.parseTVArchives(baseURL, 'https://www.billoreilly.com/show?action=tvShowArchive', debugMessages)
-        articleList = self.parseNoSpinArchives(baseURL, 'https://www.billoreilly.com/blog?categoryID=7', debugMessages)
-        stratList = self.parseStratfor(baseURL, 'http://www.billoreilly.com/blog?categoryID=5', debugMessages)
-        dailyBriefs = self.parseDailyBriefs(baseURL, 'http://www.billoreilly.com/blog?categoryID=11', debugMessages)
-        talkingPoints = self.parseTalkingPoints(baseURL, 'https://www.billoreilly.com/blog?categoryID=12', debugMessages)
-        currentColumn = self.parseCurrentColumn(baseURL, 'https://www.billoreilly.com/currentcolumn', debugMessages)
-        # Below, { x:y, a:b } creates a dictionary. We return a tuple of a title and list of dict...
-        # Lists are constructed with square brackets, separating items with commas: [a, b, c]. Tuples are constructed by the comma operator (not within square brackets), with or without enclosing parentheses, but an empty tuple must have the enclosing parentheses, such as a, b, c or (). A single item tuple must have a trailing comma, such as (d,).
-        # Shows first two if talking points and no spin news. Also if they are TV Shows ande Stratfor Weekly, also if Daily Briefing and Curren Column
-        # So all work individually. No idea why only getting first two in TOC now.
-        ans = [("Talking Points Memos", talkingPoints),("No Spin News", articleList),("TV Shows", showList),("Stratfor Weekly",stratList), ("Daily Briefing", dailyBriefs),("Current Column", currentColumn)]
-        if debugMessages :
-            print ans
-        return ans
+        return self.parseGeneric(baseURL)

     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
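The net effect of this change is that six nearly identical per-category parsers collapse into one loop driven by the catList table, and parse_index now just delegates to it. A minimal, self-contained sketch of that table-driven pattern (category names, data and helper functions below are invented for illustration; the real recipe keeps [name, URL, tag, attrs, articleList] rows in self.catList and branches on the row index inside one loop):

def parse_many(page):
    # Stand-in for the "many articles per page" categories (indexes 0-2 above).
    return [{'title': t, 'url': u} for t, u in page]

def parse_single(page):
    # Stand-in for the single-article categories (indexes 3-5 above).
    title, url = page[0]
    return [{'title': title, 'url': url}]

CATEGORIES = [
    ('TV Archives',    [('Show 1', '/tv/1'), ('Show 2', '/tv/2')], parse_many),
    ('Current Column', [('This week', '/column')],                 parse_single),
]

def parse_generic(categories):
    # One generic loop replaces a separate parser per category.
    feeds = []
    for name, page, parser in categories:
        feeds.append((name, parser(page)))
    return feeds

if __name__ == '__main__':
    print(parse_generic(CATEGORIES))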

View File

@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.variety.com
 '''
@@ -14,11 +14,11 @@ class Variety(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    encoding = 'cp1252'
+    encoding = 'utf8'
     publisher = 'Red Business Information'
     category = 'Entertainment Industry News, Daily Variety, Movie Reviews, TV, Awards, Oscars, Cannes, Box Office, Hollywood'
     language = 'en'
-    masthead_url = 'http://a330.g.akamai.net/7/330/23382/20090528190853/www.variety.com/graphics/variety/Variety_logo_green_tm.gif'
+    masthead_url = 'http://images1.variety.com/graphics/variety/Variety_logo_green_tm.gif'
     extra_css = ' body{font-family: Georgia,"Times New Roman",Times,Courier,serif } img{margin-bottom: 1em} '

     conversion_options = {
@@ -30,17 +30,10 @@ class Variety(BasicNewsRecipe):
     remove_tags = [dict(name=['object','link','map'])]

-    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
+    keep_only_tags = [dict(name='div', attrs={'class':'art control'})]

     feeds = [(u'News & Articles', u'http://feeds.feedburner.com/variety/headlines' )]

     def print_version(self, url):
-        rpt = url.rpartition('?')[0]
-        artid = rpt.rpartition('/')[2]
-        catidr = url.rpartition('categoryid=')[2]
-        catid = catidr.partition('&')[0]
-        return 'http://www.variety.com/index.asp?layout=print_story&articleid=' + artid + '&categoryid=' + catid
+        rpt = url.rpartition('.html')[0]
+        return rpt + '?printerfriendly=true'

-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
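The new print_version amounts to dropping the trailing .html and asking for the printer-friendly page. A quick worked example (the article URL is invented for illustration):

url = 'http://www.variety.com/article/VR1118048335.html'
rpt = url.rpartition('.html')[0]
print(rpt + '?printerfriendly=true')
# http://www.variety.com/article/VR1118048335?printerfriendly=true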

View File

@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class VillageVoice(BasicNewsRecipe):
+
+    title = 'Village Voice'
+    feeds = [
+        ("Complete Issue", "http://villagevoice.com/syndication/issue"),
+        ("News", "http://villagevoice.com/syndication/section/news"),
+        ("Music", "http://villagevoice.com/syndication/section/music"),
+        ("Movies", "http://villagevoice.com/syndication/section/film"),
+        #("Restaurants", "http://villagevoice.com/syndication/section/dining"),
+        #("Music Events", "http://villagevoice.com/syndication/events?type=music"),
+        #("Calendar Events", "http://villagevoice.com/syndication/events"),
+        #("Promotional Events", "http://villagevoice.com/syndication/promoEvents"),
+        #("Restaurant Guide", "http://villagevoice.com/syndication/restaurants/search")
+        ]
+
+    auto_cleanup = True
+    max_articles_per_feed = 50
+    masthead_url = "http://assets.villagevoice.com/img/citylogo.png"
+    language = 'en'
+    __author__ = 'Barty'
+
+    seen_urls = []
+
+    # village voice breaks the article up into multiple pages, so
+    # parse page and grab the print url
+    url_regex = re.compile(r'\/content\/printVersion\/\d+',re.I)
+
+    def print_version(self, url):
+        if url in self.seen_urls:
+            return None
+        self.seen_urls.append( url)
+        soup = self.index_to_soup(url)
+        atag = soup.find('a',attrs={'href':self.url_regex})
+        if atag is None:
+            self.log('Warning: no print url found for '+url)
+        else:
+            m = self.url_regex.search(atag['href'])
+            if m:
+                url = 'http://www.villagevoice.com'+m.group(0)
+        return url
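print_version above relies on a regex over hrefs to find the single-page print link. A small illustration of that match (the href value is invented):

import re

url_regex = re.compile(r'\/content\/printVersion\/\d+', re.I)
m = url_regex.search('/content/printVersion/2373810?page=all')
if m:
    print('http://www.villagevoice.com' + m.group(0))
    # http://www.villagevoice.com/content/printVersion/2373810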

View File

@@ -197,7 +197,7 @@ title_series_sorting = 'library_order'
 # For example, if the tweak is set to library_order, "The Lord of the Rings"
 # will become "Lord of the Rings, The". If the tweak is set to
 # strictly_alphabetic, it would remain "The Lord of the Rings". Note that the
 # formatter function raw_field will return the base value for title and
 # series regardless of the setting of this tweak.
 save_template_title_series_sorting = 'library_order'

View File

@@ -13,6 +13,7 @@
 3. Much more comprehensive testing/error handling
 4. Properly encodes/decodes assertions
 5. Handles points in the padding of elements consistently
+6. Has a utility method to calculate the CFI for the current viewport position robustly

 To check if this script is compatible with the current browser, call
 window.cfi.is_compatible() it will throw an exception if not compatible.
@@ -72,7 +73,7 @@ get_current_time = (target) -> # {{{
     fstr(ans)
 # }}}

-window_scroll_pos = (win) -> # {{{
+window_scroll_pos = (win=window) -> # {{{
     if typeof(win.pageXOffset) == 'number'
         x = win.pageXOffset
         y = win.pageYOffset
@@ -86,18 +87,18 @@ window_scroll_pos = (win) -> # {{{
     return [x, y]
 # }}}

-viewport_to_document = (x, y, doc) -> # {{{
+viewport_to_document = (x, y, doc=window?.document) -> # {{{
+    until doc == window.document
+        # We are in a frame
+        frame = doc.defaultView.frameElement
+        rect = frame.getBoundingClientRect()
+        x += rect.left
+        y += rect.top
+        doc = frame.ownerDocument
     win = doc.defaultView
     [wx, wy] = window_scroll_pos(win)
     x += wx
     y += wy
-    if doc != window.document
-        # We are in a frame
-        node = win.frameElement
-        rect = node.getBoundingClientRect()
-        [vx, vy] = viewport_to_document(rect.left, rect.top, node.ownerDocument)
-        x += vx
-        y += vy
     return [x, y]
 # }}}
@@ -157,7 +158,8 @@ class CanonicalFragmentIdentifier
     is_compatible(): Throws an error if the browser is not compatible with
                      this script

-    at(x, y): which maps a point to a CFI, if possible
+    at(x, y): Maps a point to a CFI, if possible
+    at_current(): Returns the CFI corresponding to the current viewport scroll location

     scroll_to(cfi): which scrolls the browser to a point corresponding to the
                     given cfi, and returns the x and y co-ordinates of the point.
@@ -397,6 +399,8 @@ class CanonicalFragmentIdentifier
             if not cd
                 break

+            # We have an embedded document, transforms x, y into the co-ord
+            # system of the embedded document's viewport
             rect = target.getBoundingClientRect()
             x -= rect.left
             y -= rect.top
@@ -557,11 +561,73 @@ class CanonicalFragmentIdentifier
             null
     # }}}

-    current_cfi: () -> # {{{
+    at_current: () -> # {{{
         [winx, winy] = window_scroll_pos()
         [winw, winh] = [window.innerWidth, window.innerHeight]
+        max = Math.max
         winw = max(winw, 400)
         winh = max(winh, 600)
+        deltay = Math.floor(winh/50)
+        deltax = Math.floor(winw/25)
+        miny = max(-winy, -winh)
+        maxy = winh
+        minx = max(-winx, -winw)
+        maxx = winw
+        dist = (p1, p2) ->
+            Math.sqrt(Math.pow(p1[0]-p2[0], 2), Math.pow(p1[1]-p2[1], 2))
+        get_cfi = (ox, oy) ->
+            try
+                cfi = this.at(ox, oy)
+                point = this.point(cfi)
+            catch err
+                cfi = null
+            if point.range != null
+                r = point.range
+                rect = r.getClientRects()[0]
+                x = (point.a*rect.left + (1-point.a)*rect.right)
+                y = (rect.top + rect.bottom)/2
+                [x, y] = viewport_to_document(x, y, r.startContainer.ownerDocument)
+            else
+                node = point.node
+                r = node.getBoundingClientRect()
+                [x, y] = viewport_to_document(r.left, r.top, node.ownerDocument)
+                if typeof(point.x) == 'number' and node.offsetWidth
+                    x += (point.x*node.offsetWidth)/100
+                if typeof(point.y) == 'number' and node.offsetHeight
+                    y += (point.y*node.offsetHeight)/100
+            if dist(viewport_to_document(ox, oy), [x, y]) > 50
+                cfi = null
+            return cfi
+        x_loop = (cury) ->
+            for direction in [-1, 1]
+                delta = deltax * direction
+                curx = 0
+                until (direction < 0 and curx < minx) or (direction > 0 and curx > maxx)
+                    cfi = get_cfi(curx, cury)
+                    if cfi
+                        return cfi
+                    curx += delta
+            null
+        for direction in [-1, 1]
+            delta = deltay * direction
+            cury = 0
+            until (direction < 0 and cury < miny) or (direction > 0 and cury > maxy)
+                cfi = x_loop(cury, -1)
+                if cfi
+                    return cfi
+                cury += delta
+        # TODO: Return the CFI corresponding to the <body> tag
+        null
     # }}}

 if window?
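In plain terms, at_current samples viewport points starting at the scroll origin and stepping outward in small x and y increments (winw/25 and winh/50) until one of them resolves to a CFI whose document position is close to the sampled point. A rough Python restatement of that search loop, not a translation of the CoffeeScript above (get_cfi is a stand-in for the real window.cfi.at()/point() round trip):

def find_first_cfi(get_cfi, winw, winh):
    # get_cfi(x, y) should return a CFI string for a usable point, else None.
    deltax, deltay = max(winw // 25, 1), max(winh // 50, 1)
    for ydir in (-1, 1):
        cury = 0
        while -winh <= cury <= winh:
            for xdir in (-1, 1):
                curx = 0
                while -winw <= curx <= winw:
                    cfi = get_cfi(curx, cury)
                    if cfi:
                        return cfi
                    curx += deltax * xdir
            cury += deltay * ydir
    return None

# Example: nothing on screen maps to a CFI, so the search exhausts the grid.
print(find_first_cfi(lambda x, y: None, 1024, 768))   # None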

View File

@@ -59,26 +59,13 @@ mark_and_reload = (evt) ->
         setTimeout(fn, 1)
     null

-window_scroll_pos = (win) ->
-    if typeof(win.pageXOffset) == 'number'
-        x = win.pageXOffset
-        y = win.pageYOffset
-    else # IE < 9
-        if document.body and ( document.body.scrollLeft or document.body.scrollTop )
-            x = document.body.scrollLeft
-            y = document.body.scrollTop
-        else if document.documentElement and ( document.documentElement.scrollLeft or document.documentElement.scrollTop)
-            y = document.documentElement.scrollTop
-            x = document.documentElement.scrollLeft
-    return [x, y]
-
 frame_clicked = (evt) ->
     iframe = evt.target.ownerDocument.defaultView.frameElement
     # We know that the offset parent of the iframe is body
     # So we can easily calculate the event co-ords w.r.t. the browser window
-    [winx, winy] = window_scroll_pos(window)
-    x = evt.clientX + iframe.offsetLeft - winx
-    y = evt.clientY + iframe.offsetTop - winy
+    rect = iframe.getBoundingClientRect()
+    x = evt.clientX + rect.left
+    y = evt.clientY + rect.top
     mark_and_reload({'clientX':x, 'clientY':y, 'button':evt.button})

 window.onload = ->

View File

@@ -23,6 +23,7 @@
         indignation and dislike men who are so beguiled and demoralized by
         the charms of pleasure of the moment, so blinded by desire, that
         they cannot foresee</p>
+        <p><img src="marker.png" width="300" height="300" alt="Test image"/></p>
     </body>
 </html>

View File

@@ -1,7 +1,7 @@
 <!DOCTYPE html>
 <html>
     <head>
-        <title>Testing EPUB CFI</title>
+        <title>Testing cfi.coffee</title>
         <script type="text/javascript" src="cfi.coffee"></script>
         <script type="text/javascript" src="cfi-test.coffee"></script>
         <style type="text/css">
@@ -46,7 +46,8 @@
     </head>
     <body>
         <div id="container">
-            <h1 id="first-h1">Testing EPUB CFI</h1>
+            <h1 id="first-h1">Testing cfi.coffee</h1>
+            <p>Click anywhere and the location will be marked with a marker, whose position is set via a CFI.</p>
             <p><a id="reset" href="/">Reset CFI to None</a></p>
             <h2>A div with scrollbars</h2>
             <p>Scroll down and click on some elements. Make sure to hit both

View File

@@ -462,7 +462,7 @@ class Scheduler(QObject):
             delta = timedelta(days=self.oldest)
             try:
                 ids = list(self.db.tags_older_than(_('News'),
-                    delta))
+                    delta, must_have_authors=['calibre']))
             except:
                 # Happens if library is being switched
                 ids = []

View File

@@ -362,7 +362,7 @@
       <item>
        <widget class="QLabel" name="label_7">
         <property name="text">
-         <string>&amp;Delete downloaded news older than:</string>
+         <string>Delete downloaded news &amp;older than:</string>
         </property>
         <property name="buddy">
          <cstring>old_news</cstring>

View File

@@ -73,6 +73,9 @@ class JavaScriptLoader(object):
             src = self.get(x)
             evaljs(src)

+        if not lang:
+            lang = 'en'
+
         def lang_name(l):
             l = l.lower()
             l = lang_as_iso639_1(l)

View File

@@ -2002,7 +2002,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):

     ############# End get_categories

-    def tags_older_than(self, tag, delta, must_have_tag=None):
+    def tags_older_than(self, tag, delta, must_have_tag=None,
+            must_have_authors=None):
         '''
         Return the ids of all books having the tag ``tag`` that are older than
         than the specified time. tag comparison is case insensitive.
@@ -2011,6 +2012,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         the tag are returned.
         :param must_have_tag: If not None the list of matches will be
                               restricted to books that have this tag
+        :param must_have_authors: A list of authors. If not None the list of
+                                  matches will be restricted to books that have these authors (case
+                                  insensitive).
         '''
         tag = tag.lower().strip()
         mht = must_have_tag.lower().strip() if must_have_tag else None
@@ -2018,9 +2022,18 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         tindex = self.FIELD_MAP['timestamp']
         gindex = self.FIELD_MAP['tags']
         iindex = self.FIELD_MAP['id']
+        aindex = self.FIELD_MAP['authors']
+        mah = must_have_authors
+        if mah is not None:
+            mah = [x.replace(',', '|').lower() for x in mah]
+            mah = ','.join(mah)
         for r in self.data._data:
             if r is not None:
                 if delta is None or (now - r[tindex]) > delta:
+                    if mah:
+                        authors = r[aindex] or ''
+                        if authors.lower() != mah:
+                            continue
                     tags = r[gindex]
                     if tags:
                         tags = [x.strip() for x in tags.lower().split(',')]
@@ -3205,6 +3218,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
             stream.seek(0)
             mi = get_metadata(stream, format, use_libprs_metadata=False,
                               force_read_metadata=True)
+            # Force the author to calibre as the auto delete of old news checks for
+            # both the author==calibre and the tag News
+            mi.authors = ['calibre']
             stream.seek(0)
         if mi.series_index is None:
             mi.series_index = self.get_next_series_num_for(mi.series)
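For reference, the new author filter is an exact string comparison after both sides are normalised the way calibre stores authors (names joined with ',' and commas inside a name replaced by '|'). A small illustration, with the stored value invented:

must_have_authors = ['calibre']
mah = ','.join(x.replace(',', '|').lower() for x in must_have_authors)  # 'calibre'
stored_authors = 'calibre'  # roughly what r[aindex] holds for news downloaded after this commit
print(stored_authors.lower() == mah)  # True, so the book passes the author check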

View File

@@ -12,7 +12,7 @@ Utilities to help with developing coffeescript based apps.
 A coffeescript compiler and a simple web server that automatically serves
 coffeescript files as javascript.
 '''
-import sys, traceback, importlib, io
+import sys, traceback, io
 if sys.version_info.major > 2:
     print('This script is not Python 3 compatible. Run it with Python 2',
             file=sys.stderr)
@@ -22,125 +22,48 @@ import time, BaseHTTPServer, os, sys, re, SocketServer
 from threading import Lock
 from SimpleHTTPServer import SimpleHTTPRequestHandler

-from PyQt4.QtWebKit import QWebPage
-from PyQt4.Qt import QThread, QApplication
+from PyQt4.Qt import QCoreApplication, QScriptEngine, QScriptValue

-# Infrastructure {{{
-def import_from_calibre(mod):
-    try:
-        return importlib.import_module(mod)
-    except ImportError:
-        import init_calibre
-        init_calibre
-        return importlib.import_module(mod)
-
-_store_app = gui_thread = None
-def check_qt():
-    global gui_thread, _store_app
-    _plat = sys.platform.lower()
-    iswindows = 'win32' in _plat or 'win64' in _plat
-    isosx = 'darwin' in _plat
-    islinux = not (iswindows or isosx)
-    if islinux and ':' not in os.environ.get('DISPLAY', ''):
-        raise RuntimeError('X server required. If you are running on a'
-                ' headless machine, use xvfb')
-    if _store_app is None and QApplication.instance() is None:
-        _store_app = QApplication([])
-    if gui_thread is None:
-        gui_thread = QThread.currentThread()
-    if gui_thread is not QThread.currentThread():
-        raise RuntimeError('Cannot use Qt in non GUI thread')
-
-def fork_job(*args, **kwargs):
-    try:
-        return import_from_calibre('calibre.utils.ipc.simple_worker').fork_job(*args,
-                **kwargs)
-    except ImportError:
-        # We aren't running in calibre
-        import subprocess
-        raw, filename = kwargs['args']
-        cs = ''
-        try:
-            p = subprocess.Popen([sys.executable, __file__, 'compile', '-'],
-                    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE)
-            if isinstance(raw, unicode):
-                raw = raw.encode('utf-8')
-            stdout, stderr = p.communicate(raw)
-            cs = stdout.decode('utf-8')
-            errors = [stderr]
-        except:
-            errors = [traceback.format_exc()]
-        return {'result':(cs, errors)}
-# }}}
-
-class Compiler(QWebPage): # {{{
+class Compiler(QScriptEngine): # {{{
     '''
-    Never use this class in anything except the main thread. If you want to use
-    it from other threads, use the forked_compile method instead.
+    You can use this class in any thread, but make sure you instantiate it in
+    the main thread. Alternatively, construct a QCoreApplication in the main
+    thread, after which you can instantiate this class and use it in any
+    thread.
     '''

     def __init__(self):
-        check_qt()
-        QWebPage.__init__(self)
-        self.frame = self.mainFrame()
-        self.filename = self._src = ''
-        self.frame.evaluateJavaScript(CS_JS)
-        self.frame.addToJavaScriptWindowObject("cs_compiler", self)
-        self.errors = []
-
-    def shouldInterruptJavaScript(self):
-        return True
-
-    def javaScriptConsoleMessage(self, msg, lineno, sourceid):
-        sourceid = sourceid or self.filename or '<script>'
-        self.errors.append('%s:%s'%(sourceid, msg))
-
-    def __evalcs(self, raw, filename):
-        # This method is NOT thread safe
-        self.filename = filename
-        self.setProperty('source', raw)
-        self.errors = []
-        res = self.frame.evaluateJavaScript('''
-            raw = document.getElementById("raw");
-            raw = cs_compiler.source;
-            CoffeeScript.compile(raw);
-            ''')
-        ans = ''
-        if res.type() == res.String:
-            ans = unicode(res.toString())
-        return ans, list(self.errors)
+        if QCoreApplication.instance() is None:
+            self.__app_ = QCoreApplication([])
+
+        QScriptEngine.__init__(self)
+        res = self.evaluate(CS_JS, 'coffee-script.js')
+        if res.isError():
+            raise Exception('Failed to run the coffee script compiler: %s'%
+                    unicode(res.toString()))
+        self.lock = Lock()

     def __call__(self, raw, filename=None):
-        if not isinstance(raw, unicode):
-            raw = raw.decode('utf-8')
-        return self.__evalcs(raw, filename)
-
-def forked_compile(raw, fname):
-    # Entry point for the compile worker
-    try:
-        ans, errors = Compiler()(raw, fname)
-    except:
-        ans, errors = '', [traceback.format_exc()]
-    return ans, errors
+        with self.lock:
+            if not isinstance(raw, unicode):
+                raw = raw.decode('utf-8')
+            if not filename:
+                filename = '<string>'
+            go = self.globalObject()
+            go.setProperty('coffee_src', QScriptValue(raw),
+                    go.ReadOnly|go.Undeletable)
+            res = self.evaluate('this.CoffeeScript.compile(this.coffee_src)',
+                    filename)
+            if res.isError():
+                return '', [unicode(res.toString())]
+            return unicode(res.toString()), []
 # }}}

 def compile_coffeescript(raw, filename=None):
-    try:
-        cs, errors = fork_job('calibre.utils.serve_coffee',
-                'forked_compile', args=(raw, filename), timeout=5,
-                no_output=True)['result']
-    except Exception as e:
-        cs = None
-        errors = [getattr(e, 'orig_tb', traceback.format_exc())]
-    return cs, errors
+    return Compiler()(raw, filename)

 class HTTPRequestHandler(SimpleHTTPRequestHandler): # {{{
     '''
@@ -317,7 +240,7 @@ class Handler(HTTPRequestHandler): # {{{
             mtime = time.time()
             with open(src, 'rb') as f:
                 raw = f.read()
-            cs, errors = compile_coffeescript(raw, src)
+            cs, errors = self.compiler(raw, src)
             for line in errors:
                 print(line, file=sys.stderr)
             if not cs:
@@ -351,6 +274,7 @@ class Server(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer): # {{{

 def serve(resources={}, port=8000, host='0.0.0.0'):
     Handler.special_resources = resources
+    Handler.compiler = Compiler()
     httpd = Server((host, port), Handler)
     print('serving %s at %s:%d with PID=%d'%(os.getcwdu(), host, port, os.getpid()))
     try:
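As a usage note, the rewritten Compiler is a plain callable returning a (javascript, errors) pair, per its __call__ above. A hedged sketch of calling it from within this module, assuming PyQt4 and the bundled coffee-script.js (CS_JS) are available; the CoffeeScript source string is invented:

compiler = Compiler()
js, errors = compiler('square = (x) -> x * x', 'example.coffee')
if errors:
    for line in errors:
        print(line)
else:
    print(js)  # the compiled JavaScript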