Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)

Commit 07e888f764: merging with trunk

145 Changelog.yaml
@@ -4,6 +4,151 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.

- version: 0.6.44
  date: 2010-03-05

  new features:
    - title: "Experimental support for conversion of CHM files"
      type: major
      description: >
        "Conversion and reading of metadata from CHM files is now supported. This feature is
        still experimental, with more testing needed. Building from source on linux now
        requires chmlib."

    - title: "Experimental support for fetching annotations from the Kindle"
      type: major
      description: >
        "calibre can now fetch annotations from your Kindle and put them into the
        comments field. To fetch annotations, click the arrow next to the
        'Send to device' button and select 'Fetch Annotations', with your Kindle
        connected."
      # (an illustrative sketch follows this list)

    - title: "Support FreeBSD out of the box (except USB)"
      type: major
      tickets: [4715]

    - title: "News download scheduler: Don't try to download news when no active internet connection is present (linux/windows only)"

    - title: "EPUB to EPUB conversion: Preserve font encryption"

    - title: "calibre-server: Add --pidfile and --daemonize (unix only) options"

    - title: "Plugins: When loading a plugin zip file that contains binary code (pyd/dll/so/dylib), extract it to a temporary directory and add that directory to sys.path, instead of just adding the zip file to the path, as Python cannot load compiled code from a zip file"
      # (an illustrative sketch follows this list)
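Two of the features above lend themselves to short illustrations. First, annotation fetching: a Kindle stores highlights and notes in "documents/My Clippings.txt", with entries separated by a line of ten '=' characters. A minimal parsing sketch under that assumption (illustrative only, not calibre's actual code):

import io

def parse_my_clippings(path):
    # The Kindle writes this file as UTF-8 with a BOM; be tolerant of damage.
    with io.open(path, encoding='utf-8-sig', errors='replace') as f:
        raw = f.read()
    entries = []
    for block in raw.split('=' * 10):
        lines = [l.strip() for l in block.strip().splitlines() if l.strip()]
        if len(lines) >= 2:
            # line 0: title/author, line 1: type, location and date,
            # remaining lines: the highlighted or noted text itself
            entries.append({'title': lines[0], 'meta': lines[1],
                            'text': '\n'.join(lines[2:])})
    return entries

Second, the plugin-loading change: Python's zipimport can import pure-Python modules straight from a zip, but compiled extensions must exist on disk. A sketch of the described behaviour, with illustrative names only:

import sys, tempfile, zipfile

BINARY_EXTS = ('.pyd', '.dll', '.so', '.dylib')

def add_plugin_to_path(zip_path):
    with zipfile.ZipFile(zip_path) as zf:
        if any(n.lower().endswith(BINARY_EXTS) for n in zf.namelist()):
            tdir = tempfile.mkdtemp(prefix='plugin-')
            zf.extractall(tdir)           # compiled code must live on disk
            sys.path.insert(0, tdir)
        else:
            sys.path.insert(0, zip_path)  # zipimport handles pure Python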
  bug fixes:
    - title: "Ebook-viewer: Handle non-ASCII CSS files when doing font substitutions"

    - title: "Conversion pipeline: Ignore non-integral play orders when parsing NCX files"

    - title: "When decoding NCX ToC files, if no encoding is declared and detection has less than 100% confidence, assume UTF-8."
      tickets: [5039]
      # (an illustrative sketch follows this list)

    - title: "PML chapter definitions missing from toc.ncx"
      tickets: [4990]

    - title: "Unicode string for cover causes calibredb --output-format stanza to fail"
      tickets: [5035]

    - title: "Search cover:False fails, cover:True succeeds"
      tickets: [5034]

    - title: "Plugins: Correctly use context"

    - title: "MOBI Input: Don't lose the cover if it is also referred to in the main text"
      tickets: [5020]

    - title: "RTF Output: Don't choke on PNG images"
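A sketch of the NCX decoding policy above, assuming a chardet-style detector (calibre wraps its own detection helpers; the names here are illustrative):

import codecs
import chardet

def decode_ncx(raw):
    # An explicit BOM or a declared XML encoding would be honoured first
    # (that check is elided here).
    if raw.startswith(codecs.BOM_UTF8):
        return raw.decode('utf-8-sig')
    guess = chardet.detect(raw)
    if guess['encoding'] and guess['confidence'] >= 1.0:
        return raw.decode(guess['encoding'], 'replace')
    # No declared encoding and detection is uncertain: assume UTF-8.
    return raw.decode('utf-8', 'replace')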
  new recipes:
    - title: Journal of Hospital Medicine, San Francisco Bay Guardian, Smithsonian Magazine
      author: Krittika Goyal

    - title: Astronomy Picture of the Day, Epicurious
      author: Starson17

    - title: Diario Vasco, Various Chilean newspapers
      author: Darko Miletic

    - title: Kukuburi
      author: Mori

  improved recipes:
    - Ars Technica
    - Fudzilla
    - The Atlantic
    - The Economist
    - Huffington Post
- version: 0.6.43
  date: 2010-02-26

  new features:
    - title: "Support for the Teclast K3 and Elonex e-book readers"

    - title: "Add 'Recently Read' category to catalog if Kindle is connected when catalog is generated"

    - title: "When adding PRC/MOBI files that are actually Topaz files, change detected file type to Topaz"
      # (an illustrative sketch follows this list)

    - title: "MOBI Output: If the SVG rasterizer is not available, continue anyway"

    - title: "News download: When using the debug pipeline options, create a zip file named periodical.downloaded_recipe in the debug directory. This can be passed to ebook-convert to directly convert a previous download into an e-book."

    - title: "Add Apply button to catalog generation dialog"
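A sketch of the kind of signature check the Topaz detection implies, assuming Topaz files begin with the 'TPZ' magic bytes (illustrative, not calibre's actual code):

def is_topaz(path):
    with open(path, 'rb') as f:
        return f.read(3) == b'TPZ'  # assumed Topaz magic number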
  bug fixes:
    - title: "When fetching metadata in the edit metadata dialog, use a Python thread instead of a Qt thread. Hopefully this will fix the reports of crashes when fetching metadata"

    - title: "Refresh cover browser when a cover is changed via the edit meta information dialog"

    - title: "More device detection debug output on OS X"

    - title: "Download only covers should not also set social metadata"
      tickets: [4966]

    - title: "Content server: If binding to 0.0.0.0 fails, try detecting and binding only to the interface used for outgoing traffic"
      # (an illustrative sketch follows this list)

    - title: "Handle poorly designed import plugins that return None on error"

    - title: "Move logic for removing inline navbars out of the BasicNewsRecipe class"

    - title: "MOBI metadata: When setting the title, set it in both the PalmDoc and EXTH headers"

    - title: "MOBI metadata: Do not try to extract embedded metadata from MOBI files larger than 4MB"

    - title: "Handle PDB files that contain PDF files"
      tickets: [4971]

    - title: "PML Input: Various fixes"
      tickets: [4959,4961]

    - title: "Fix reading MOBI metadata from files in zip/rar archives"

    - title: "Make extracting single files from RAR archives more efficient"

    - title: "No longer need Qt to generate the default cover for news downloads"

    - title: "Catalog generation: Fix EPUB anchors beginning with numbers in Recently Added"

    - title: "Searching: Handle uppercase keywords correctly"
      tickets: [4951]
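A sketch of the content-server fallback above: when binding to 0.0.0.0 fails, the interface used for outgoing traffic can be found by connecting a UDP socket to a public address and reading the local end of that socket. The function name and probe address are illustrative:

import socket

def pick_bind_address(port):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind(('0.0.0.0', port))
        return '0.0.0.0'          # the wildcard bind works; use it
    except socket.error:
        pass
    finally:
        s.close()
    probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # connect() on a UDP socket sends no packets; it only selects
        # the local interface that would route to this address.
        probe.connect(('8.8.8.8', 53))
        return probe.getsockname()[0]
    finally:
        probe.close()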
  new recipes:
    - title: Gamasutra
      author: Darko Miletic

  improved recipes:
    - "Strategy+Business"
    - Arizona Daily Star
    - Heise
    - New Scientist
    - Various Serbian news feeds
    - Houston and San Francisco Chronicles

- version: 0.6.42
  date: 2010-02-20
@@ -79,9 +79,24 @@ p.unread_book {
    text-indent:-2em;
}

p.date_read {
    text-align:left;
    margin-top:0px;
    margin-bottom:0px;
    margin-left:6em;
    text-indent:-6em;
}

hr.series_divider {
    width:50%;
    margin-left:1em;
    margin-top:0em;
    margin-bottom:0em;
}

hr.annotations_divider {
    width:50%;
    margin-left:1em;
    margin-top:0em;
    margin-bottom:0em;
}
Binary file not shown. (Before: 116 KiB, After: 124 KiB)

BIN resources/images/news/diariovasco.png (new file, 766 B; binary file not shown)
BIN resources/images/news/gamasutra_fa.png (new file, 956 B; binary file not shown)
BIN resources/images/news/gamasutra_news.png (new file, 956 B; binary file not shown)
@@ -1,7 +1,6 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'

'''
24sata.rs
@@ -9,7 +8,6 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Ser24Sata(BasicNewsRecipe):
    title = '24 Sata - Sr'
@@ -17,22 +15,20 @@ class Ser24Sata(BasicNewsRecipe):
    description = '24 sata portal vesti iz Srbije'
    publisher = 'Ringier d.o.o.'
    category = 'news, politics, entertainment, Serbia'
    oldest_article = 7
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'sr'

    lang = 'sr-Latn-RS'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : lang
                        , 'pretty_print'     : True
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@@ -40,25 +36,6 @@ class Ser24Sata(BasicNewsRecipe):
    feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')]

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang

        attribs = [ 'style','font','valign'
                   ,'colspan','width','height'
                   ,'rowspan','summary','align'
                   ,'cellspacing','cellpadding'
                   ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]

        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return self.adeify_images(soup)

    def print_version(self, url):
37 resources/recipes/apod.recipe (new file)
@@ -0,0 +1,37 @@
from calibre.web.feeds.news import BasicNewsRecipe

class APOD(BasicNewsRecipe):
    title = u'Astronomy Picture of the Day'
    __author__ = 'Starson17'
    description = 'Astronomy Pictures'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    cover_url = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg'
    remove_javascript = True
    recursions = 0
    oldest_article = 14

    feeds = [
        (u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss')
    ]

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    def postprocess_html(self, soup, first_fetch):
        # Drop the trailing navigation <center> block and the first and
        # last two paragraphs, which carry boilerplate rather than content.
        center_tags = soup.findAll(['center'])
        if center_tags:
            center_tags[-1].extract()
        p_tags = soup.findAll(['p'])
        for tag in p_tags[:1] + p_tags[-2:]:
            tag.extract()
        return soup
@@ -5,6 +5,7 @@ __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
arstechnica.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

@@ -20,7 +21,7 @@ class ArsTechnica2(BasicNewsRecipe):
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    extra_css = ' body {font-family: sans-serif} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '
    extra_css = ' body {font-family: Arial,Helvetica,sans-serif} .title{text-align: left} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '

    conversion_options = {
                          'comments' : description
@@ -30,6 +31,10 @@ class ArsTechnica2(BasicNewsRecipe):
                         }

    preprocess_regexps = [
        (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
       ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
    ]

    keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]

@@ -37,7 +42,7 @@ class ArsTechnica2(BasicNewsRecipe):
        dict(name=['object','link','embed'])
       ,dict(name='div', attrs={'class':'read-more-link'})
    ]

    remove_attributes=['width','height']

    feeds = [
        (u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' )
@@ -90,3 +95,5 @@ class ArsTechnica2(BasicNewsRecipe):

        return soup

    def get_article_url(self, article):
        return article.get('guid', None).rpartition('?')[0]
@@ -5,76 +5,103 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
theatlantic.com
'''
import re
import string

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString

class TheAtlantic(BasicNewsRecipe):

    title = 'The Atlantic'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Current affairs and politics focussed on the US'
    INDEX = 'http://www.theatlantic.com/doc/current'
    INDEX = 'http://www.theatlantic.com/magazine/toc/0/'
    language = 'en'

    remove_tags_before = dict(name='div', id='storytop')
    remove_tags = [
        dict(name='div', id=['seealso','storybottom', 'footer', 'ad_banner_top', 'sidebar','articletoolstop','subcontent',]),
        dict(name='p', attrs={'id':["pagination"]}),
        dict(name='table',attrs={'class':"tools"}),
        dict(name='style'),
        dict(name='a', href='/a/newsletters.mhtml')
    ]
    remove_attributes = ['icap', 'callout', 'style']
    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after = dict(id='copyright')
    remove_tags = [dict(id=['header', 'printAds', 'pageControls'])]
    no_stylesheets = True
    conversion_options = { 'linearize_tables':True }

    extra_css = '''
        #timestamp{font-family:Arial,Helvetica,sans-serif; color:#666666 ;font-size:x-small}
        #storytype{font-family:Arial,Helvetica,sans-serif; color:#D52B1E ;font-weight:bold; font-size:x-small}
        h2{font-family:georgia,serif; font-style:italic;font-size:x-small;font-weight:normal;}
        h1{font-family:georgia,serif; font-weight:bold; font-size:large}
        #byline{font-family:georgia,serif; font-weight:bold; font-size:x-small}
        #topgraf{font-family:Arial,Helvetica,sans-serif;font-size:x-small;font-weight:bold;}
        .artsans{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
    '''

    def print_version(self, url):
        return url.replace('/archive/', '/print/')

    def parse_index(self):
        articles = []

        soup = self.index_to_soup(self.INDEX)
        sectit = soup.find('h1', attrs={'class':'sectionTitle'})
        if sectit is not None:
            texts = sectit.findAll('cufontext')
            texts = map(self.tag_to_string, texts[-2:])
            self.timefmt = ' [%s]'%(''.join(texts))

        issue = soup.find('span', attrs={'class':'issue'})
        if issue:
            self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')

        cover = soup.find('img', alt=re.compile('Cover'), src=True)
        cover = soup.find('img', src=True, attrs={'class':'cover'})
        if cover is not None:
            self.cover_url = 'http://theatlantic.com'+cover['src']
            self.cover_url = cover['src']

        for item in soup.findAll('div', attrs={'class':'item'}):
            a = item.find('a')
            if a and a.has_key('href'):
        feeds = []
        for section in soup.findAll('div', attrs={'class':'magazineSection'}):
            section_title = section.find(attrs={'class':'sectionHeader'})
            section_title = string.capwords(self.tag_to_string(section_title))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('div', attrs={'class':'post'}):
                h = post.find(['h3', 'h4'])
                title = self.tag_to_string(h)
                a = post.find('a', href=True)
                url = a['href']
                if not url.startswith('http://'):
                    url = 'http://www.theatlantic.com/'+url
                url = url.replace('/doc/', '/doc/print/')
                title = self.tag_to_string(a)
                if title in ('VIDEO', 'AUDIO', 'INTERACTIVE MAP', 'SIDEBAR', 'RECIPES'):
                    continue
                title = title.replace('&', '&amp;')
                byline = item.find(attrs={'class':'byline'})
                date = self.tag_to_string(byline) if byline else ''
                description = ''
                if url.startswith('/'):
                    url = 'http://www.theatlantic.com'+url
                p = post.find('p', attrs={'class':'dek'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc,
                                 'date':''})
            feeds.append((section_title, articles))

            self.log('\tFound article:', title)
            self.log('\t\t', url)
        poems = []
        self.log('Found section: Poems')
        for poem in soup.findAll('div', attrs={'class':'poem'}):
            title = self.tag_to_string(poem.find('h4'))
            desc = self.tag_to_string(poem.find(attrs={'class':'author'}))
            url = 'http://www.theatlantic.com'+poem.find('a')['href']
            self.log('\tFound article:', title, 'at', url)
            self.log('\t\t', desc)
            poems.append({'title':title, 'url':url, 'description':desc,
                          'date':''})
        if poems:
            feeds.append(('Poems', poems))

            articles.append({
                'title':title,
                'date':date,
                'url':url,
                'description':description
            })
        self.log('Found section: Advice')
        div = soup.find(id='advice')
        title = self.tag_to_string(div.find('h4'))
        url = 'http://www.theatlantic.com'+div.find('a')['href']
        desc = self.tag_to_string(div.find('p'))
        self.log('\tFound article:', title, 'at', url)
        self.log('\t\t', desc)

        feeds.append(('Advice', [{'title':title, 'url':url, 'description':desc,
                                  'date':''}]))
        return feeds

    def postprocess_html(self, soup, first):
        for table in soup.findAll('table', align='right'):
            img = table.find('img')
            if img is not None:
                img.extract()
                caption = self.tag_to_string(table).strip()
                div = Tag(soup, 'div')
                div['style'] = 'text-align:center'
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                if caption:
                    div.insert(2, NavigableString(caption))
                table.replaceWith(div)

        return soup

        return [('Current Issue', articles)]
@@ -1,10 +1,10 @@

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.azstarnet.com
azstarnet.com
'''

import urllib
from calibre.web.feeds.news import BasicNewsRecipe

class Azstarnet(BasicNewsRecipe):
@@ -14,12 +14,12 @@ class Azstarnet(BasicNewsRecipe):
    language = 'en'
    publisher = 'azstarnet.com'
    category = 'news, politics, Arizona, USA'
    delay = 1
    oldest_article = 3
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    masthead_url = 'http://azstarnet.com/content/tncms/live/global/resources/images/logo.gif'
    needs_subscription = True

    conversion_options = {
@@ -32,31 +32,27 @@ class Azstarnet(BasicNewsRecipe):

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://azstarnet.com/')
        if self.username is not None and self.password is not None:
            br.open('http://azstarnet.com/registration/retro.php')
            br.select_form(nr=1)
            br['email'] = self.username
            br['pass' ] = self.password
            br.submit()
            data = urllib.urlencode({ 'm':'login'
                                     ,'u':self.username
                                     ,'p':self.password
                                     ,'z':'http://azstarnet.com/'
                                    })
            br.open('http://azstarnet.com/app/registration/proxy.php',data)
        return br

    keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})]

    remove_tags = [
        dict(name=['object','link','iframe','base','img'])
       ,dict(name='div',attrs={'class':'bannerinstory'})
    ]
    remove_tags = [dict(name=['object','link','iframe','base','img'])]

    feeds = [
        (u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')
       ,(u'Sports' , u'http://rss.azstarnet.com/index.php?site=sports')
       ,(u'Business' , u'http://rss.azstarnet.com/index.php?site=biz-topheadlines')
       ,(u'Nation-World' , u'http://rss.azstarnet.com/index.php?site=news')
       ,(u'Opinion' , u'http://rss.azstarnet.com/index.php?site=opinion')
       ,(u'Lifestyle' , u'http://rss.azstarnet.com/index.php?site=accent')
       ,(u'Food' , u'http://rss.azstarnet.com/index.php?site=food')
        (u'Local News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/local&l=25&s=start_time&sd=desc')
       ,(u'National News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/national&l=25&s=start_time&sd=desc')
       ,(u'World News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/world&l=25&s=start_time&sd=desc')
       ,(u'Sports' , u'http://azstarnet.com/search/?f=rss&t=article&c=sports&l=25&s=start_time&sd=desc')
       ,(u'Opinion' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/opinion&l=25&s=start_time&sd=desc')
       ,(u'Movies' , u'http://azstarnet.com/search/?f=rss&t=article&c=entertainment/movies&l=25&s=start_time&sd=desc')
       ,(u'Food' , u'http://azstarnet.com/search/?f=rss&t=article&c=lifestyles/food-and-cooking&l=25&s=start_time&sd=desc')
    ]

    def preprocess_html(self, soup):
@@ -64,4 +60,6 @@ class Azstarnet(BasicNewsRecipe):
            del item['style']
        return soup

    def print_version(self, url):
        return url + '?print=1'
@@ -1,7 +1,6 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
b92.net
'''
@@ -20,15 +19,14 @@ class B92(BasicNewsRecipe):
    use_embedded_content = False
    encoding = 'cp1250'
    language = 'sr'

    lang = 'sr-Latn-RS'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
    extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} '

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : lang
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@@ -50,20 +48,5 @@ class B92(BasicNewsRecipe):
        return url + '&version=print'

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll('font'):
            item.name='div'
            if item.has_key('size'):
                del item['size']
        attribs = [ 'style','font','valign'
                   ,'colspan','width','height'
                   ,'rowspan','summary','align'
                   ,'cellspacing','cellpadding'
                   ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        return soup
        return self.adeify_images(soup)
@@ -1,13 +1,11 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Danas(BasicNewsRecipe):
    title = 'BETA'
@@ -20,17 +18,13 @@ class Danas(BasicNewsRecipe):
    no_stylesheets = False
    use_embedded_content = True
    language = 'sr'

    lang = 'sr-Latn-RS'
    direction = 'ltr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em} '

    conversion_options = {
                          'comment'      : description
                        , 'tags'         : category
                        , 'publisher'    : publisher
                        , 'language'     : lang
                        , 'pretty_print' : True
                        , 'language'     : language
                        }

@@ -43,9 +37,4 @@ class Danas(BasicNewsRecipe):
    ]

    def preprocess_html(self, soup):
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return self.adeify_images(soup)
@@ -14,14 +14,13 @@ class Blic(BasicNewsRecipe):
    description = 'Blic.rs online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
    publisher = 'RINGIER d.o.o.'
    category = 'news, politics, Serbia'
    delay = 1
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png'
    language = 'sr'

    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} '
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Georgia, serif1, serif} .article_description{font-family: Arial, sans1, sans-serif} .img_full{float: none} img{margin-bottom: 0.8em} '

    conversion_options = {
                          'comment' : description
@@ -31,13 +30,15 @@ class Blic(BasicNewsRecipe):
                         }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    remove_tags_before = dict(name='div', attrs={'id':'article_info'})
    remove_tags = [dict(name=['object','link'])]
    remove_attributes = ['width','height']

    feeds = [(u'Danasnje Vesti', u'http://www.blic.rs/rss/danasnje-vesti')]

    remove_tags = [dict(name=['object','link'])]

    def print_version(self, url):
        return url + '/print'

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
36 resources/recipes/cetnixploitation.recipe (new file)
@@ -0,0 +1,36 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
chetnixploitation.blogspot.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class Chetnixploitation(BasicNewsRecipe):
    title = 'Chetnixploitation'
    __author__ = 'Darko Miletic'
    description = 'Filmski blog'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'sr'
    encoding = 'utf-8'
    no_stylesheets = True
    use_embedded_content = True
    extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } '

    conversion_options = {
                          'comment'  : description
                        , 'tags'     : 'film, blog, cetnici, srbija, ex-yu'
                        , 'publisher': 'Son of Man'
                        , 'language' : language
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    feeds = [(u'Posts', u'http://chetnixploitation.blogspot.com/feeds/posts/default')]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
@@ -20,7 +20,7 @@ class Danas(BasicNewsRecipe):
    encoding = 'utf-8'
    masthead_url = 'http://www.danas.rs/images/basic/danas.gif'
    language = 'sr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} .antrfileText{border-left: 2px solid #999999; color:#666666; margin-left: 0.8em; padding-left: 1.2em; margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} .antrfileNaslov{border-left: 2px solid #999999; color:#666666; margin-left: 0.8em; padding-left: 1.2em; font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} '
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} .antrfileText{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} .antrfileNaslov{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} '

    conversion_options = {
                          'comment' : description
50 resources/recipes/diariovasco.recipe (new file)
@@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.diariovasco.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DiarioVasco(BasicNewsRecipe):
    title = 'Diario Vasco'
    __author__ = 'Darko Miletic'
    description = 'Noticias de pais Vasco y el resto del mundo'
    publisher = 'Diario Vasco'
    category = 'news, politics, Spain'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = 'http://www.diariovasco.com/img/rd.logotipo2_dvasco.gif'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

    keep_only_tags = [
        dict(attrs={'id':'title'})
       ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
    ]
    remove_tags = [dict(name='ul')]
    remove_attributes = ['width','height']

    feeds = [
        (u'Ultimas Noticias' , u'http://www.diariovasco.com/rss/feeds/ultima.xml' )
       ,(u'Portada' , u'http://www.diariovasco.com/portada.xml' )
       ,(u'Politica' , u'http://www.diariovasco.com/rss/feeds/politica.xml' )
       ,(u'Deportes' , u'http://www.diariovasco.com/rss/feeds/deportes.xml' )
       ,(u'Economia' , u'http://www.diariovasco.com/rss/feeds/economia.xml' )
       ,(u'Mundo' , u'http://www.diariovasco.com/rss/feeds/mundo.xml' )
       ,(u'Cultura' , u'http://www.diariovasco.com/rss/feeds/cultura.xml' )
       ,(u'Gente' , u'http://www.diariovasco.com/rss/feeds/gente.xml' )
       ,(u'Contraportada' , u'http://www.diariovasco.com/rss/feeds/contraportada.xml')
    ]
@@ -1,7 +1,5 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'

'''
e-novine.com
@@ -9,7 +7,6 @@ e-novine.com

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class E_novine(BasicNewsRecipe):
    title = 'E-Novine'
@@ -20,40 +17,38 @@ class E_novine(BasicNewsRecipe):
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1250'
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'sr'

    lang = 'sr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
    masthead_url = 'http://www.e-novine.com/themes/e_novine/img/logo.gif'
    extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,Helvetica,sans1,sans-serif} img{float: none; margin-bottom: 0.8em} '

    conversion_options = {
                          'comment'      : description
                        , 'tags'         : category
                        , 'publisher'    : publisher
                        , 'language'     : lang
                        , 'pretty_print' : True
                        , 'language'     : language
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]
    keep_only_tags = [
        dict(name='div', attrs={'class':'article_head'})
       ,dict(name='div', attrs={'id':'article_body'})
    ]

    remove_tags = [dict(name=['object','link','embed','iframe'])]
    remove_tags = [
        dict(name=['object','link','embed','iframe'])
       ,dict(attrs={'id':'box_article_tools'})
    ]
    remove_attributes = ['height','width','lang']

    feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )]
    feeds = [(u'Sve vesti', u'http://www.e-novine.com/feed/index.1.rss' )]

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        soup.head.insert(0,mlang)
        for item in soup.findAll(style=True):
            del item['style']
        ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
        if ftag:
            it = ftag.div
            it.extract()
            ftag.div.extract()
            ftag.insert(0,it)
        return soup
        return self.adeify_images(soup)

    def print_version(self, url):
        return url + '?print'
@@ -119,6 +119,8 @@ class Economist(BasicNewsRecipe):
                ns = NavigableString(self.tag_to_string(caption))
                div.insert(0, ns)
                div.insert(1, Tag(soup, 'br'))
                del img['width']
                del img['height']
                img.extract()
                div.insert(2, img)
                table.replaceWith(div)

@@ -123,6 +123,8 @@ class Economist(BasicNewsRecipe):
                div.insert(0, ns)
                div.insert(1, Tag(soup, 'br'))
                img.extract()
                del img['width']
                del img['height']
                div.insert(2, img)
                table.replaceWith(div)
        return soup
58 resources/recipes/epicurious.recipe (new file)
@@ -0,0 +1,58 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Starson17'
'''
www.epicurious.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

class Epicurious(BasicNewsRecipe):
    title = u'Epicurious'
    __author__ = 'Starson17'
    description = 'Food and Recipes from Epicurious'
    cover_url = 'http://up6.podbean.com/image-logos/21849_logo.jpg'
    publisher = 'Epicurious'
    tags = 'news, food, gourmet, recipes'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    recursions = 3
    oldest_article = 14
    max_articles_per_feed = 20

    keep_only_tags = [dict(name='div', attrs={'class':['mainconsolewrapper','videoheader','content_unit','entry-content','see_more_block']}),
                      dict(name='div', attrs={'id':['headline','introBlock','ingredients','preparation','articleContent','in_categories_block']})
                     ]

    remove_tags = [{'id':['printShoppingList','addnoteLnk','btnUploadVideo','enlarge_image']},
                   {'class':['subLnk','sbmWrapper','detail_division','entry-footer','comment-footer']},
                   dict(name='div', attrs={'class':['tagged','comments']})
                  ]

    remove_tags_after = [dict(name='div', attrs={'class':'entry-content'})]

    feeds = [
        (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'),
        (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'),
        (u'Features ', u'http://feeds.epicurious.com/latestfeatures'),
        (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog')
    ]

    match_regexps = [
        r'http://www.epicurious.com/.*recipes/.*/views'
    ]

    preprocess_regexps = [
        (re.compile(r'/\n', re.DOTALL|re.IGNORECASE), lambda match: '/'),
        (re.compile(r'_116.jpg', re.DOTALL|re.IGNORECASE), lambda match: '.jpg'),
        (re.compile('<div class=\"comments\".*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')
    ]

    def postprocess_html(self, soup, first_fetch):
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'
        return soup
@@ -1,27 +1,41 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010 Starson17'
'''
fudzilla.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class Fudzilla(BasicNewsRecipe):
    title = u'Fudzilla'
    __author__ = 'Darko Miletic'
    __author__ = 'Starson17'
    language = 'en'

    description = 'Tech news'
    oldest_article = 7
    remove_javascript = True
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    feeds = [ (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')]

    def print_version(self, url):
        nurl = url.replace('http://www.fudzilla.com/index.php','http://www.fudzilla.com/index2.php')
        nmain, nsep, nrest = nurl.partition('&Itemid=')
        return nmain + '&pop=1&page=0&Itemid=1'
    remove_tags_before = dict(name='div', attrs={'class':['padding']})

    remove_tags = [dict(name='td', attrs={'class':['left','right']}),
                   dict(name='div', attrs={'id':['toolbar','buttons']}),
                   dict(name='div', attrs={'class':['artbannersxtd','back_button']}),
                   dict(name='span', attrs={'class':['pathway']}),
                   dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}),
                   dict(name='table', attrs={'class':['headlines']}),
                  ]

    feeds = [
        (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1')
    ]

    preprocess_regexps = [
        (re.compile(r'<p class="MsoNormal"> Welcome.*</p> ', re.DOTALL|re.IGNORECASE), lambda match: '')
    ]
56 resources/recipes/gamasutra_fa.recipe (new file)
@@ -0,0 +1,56 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
gamasutra.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class Gamasutra(BasicNewsRecipe):
    title = 'Gamasutra Featured articles'
    __author__ = 'Darko Miletic'
    description = 'The Art and Business of Making Games'
    publisher = 'Gamasutra'
    category = 'news, games, IT'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    masthead_url = 'http://www.gamasutra.com/images/gamasutra_logo.gif'
    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .title{font-size: x-large; font-weight: bold} '

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }
    preprocess_regexps = [
        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE),lambda match: '<head><title>')
       ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
       ,(re.compile(r'</head>', re.DOTALL|re.IGNORECASE),lambda match: '</head><body>')
    ]
    remove_tags = [
        dict(name=['object','embed','iframe'])
       ,dict(attrs={'class':'adBox'})
    ]
    remove_tags_before = dict(attrs={'class':'title'})
    remove_attributes = ['width','height','name']

    feeds = [(u'Feature Articles', u'http://feeds.feedburner.com/GamasutraFeatureArticles')]

    def print_version(self, url):
        return url + '?print=1'

    def get_article_url(self, article):
        return article.get('guid', None)

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
45 resources/recipes/gamasutra_news.recipe (new file)
@@ -0,0 +1,45 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
gamasutra.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Gamasutra(BasicNewsRecipe):
    title = 'Gamasutra News'
    __author__ = 'Darko Miletic'
    description = 'The Art and Business of Making Games'
    publisher = 'Gamasutra'
    category = 'news, games, IT'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    masthead_url = 'http://www.gamasutra.com/images/gamasutra_logo.gif'
    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .newsTitle{font-size: xx-large; font-weight: bold} '

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }

    remove_tags = [dict(attrs={'class':['relatedNews','adBox']})]
    keep_only_tags = [dict(attrs={'class':['newsTitle','newsAuth','newsDate','newsText']})]
    remove_attributes = ['width','height']

    feeds = [(u'News', u'http://feeds.feedburner.com/GamasutraNews')]

    def get_article_url(self, article):
        return article.get('guid', None)

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
@@ -1,7 +1,6 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'

'''
glassrpske.com
@@ -9,7 +8,6 @@ glassrpske.com

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class GlasSrpske(BasicNewsRecipe):
    title = 'Glas Srpske'
@@ -22,20 +20,16 @@ class GlasSrpske(BasicNewsRecipe):
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    cover_url = 'http://www.glassrpske.com/var/slike/glassrpske-logo.png'
    lang = 'sr-BA'
    masthead_url = 'http://www.glassrpske.com/var/slike/glassrpske-logo.png'
    language = 'sr'

    INDEX = 'http://www.glassrpske.com'

    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} img{margin-bottom: 0.8em} '

    conversion_options = {
                          'comment'      : description
                        , 'tags'         : category
                        , 'publisher'    : publisher
                        , 'language'     : lang
                        , 'pretty_print' : True
                        , 'language'     : language
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@@ -63,11 +57,7 @@ class GlasSrpske(BasicNewsRecipe):
    ]

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        soup.head.insert(0,mlang)
        return soup
        return self.adeify_images(soup)

    def parse_index(self):
        totalfeeds = []
@@ -1,7 +1,6 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.glas-javnosti.rs
'''
@@ -19,17 +18,13 @@ class GlasJavnosti(BasicNewsRecipe):
    no_stylesheets = False
    use_embedded_content = False
    language = 'sr'

    lang = 'sr-Latn-RS'
    direction = 'ltr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em} '

    conversion_options = {
                          'comment'      : description
                        , 'tags'         : category
                        , 'publisher'    : publisher
                        , 'language'     : lang
                        , 'pretty_print' : True
                        , 'language'     : language
                        }
@@ -13,8 +13,6 @@ class heiseDe(BasicNewsRecipe):
    title = 'heise'
    description = 'Computernews from Germany'
    __author__ = 'Oliver Niesner'
    language = 'de'

    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 40
@@ -35,12 +33,10 @@ class heiseDe(BasicNewsRecipe):
        dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}),
        dict(name='p', attrs={'class':'news_option'}),
        dict(name='p', attrs={'class':'news_navi'}),
        dict(name='p', attrs={'class':'news_foren'})]
    remove_tags_after = [dict(name='p', attrs={'class':'news_foren'})]
        dict(name='div', attrs={'class':'news_foren'})]
    remove_tags_after = [dict(name='div', attrs={'class':'news_foren'})]

    feeds = [ ('heise', 'http://www.heise.de/newsticker/heise.rdf') ]
@@ -1,17 +1,41 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe

class HoustonChronicle(BasicNewsRecipe):

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Kovid Goyal'
    __author__ = 'Kovid Goyal and Sujata Raman'
    language = 'en'
    timefmt = ' [%a, %d %b, %Y]'
    no_stylesheets = True

    keep_only_tags = [dict(id=['story-head', 'story'])]
    remove_tags = [dict(id=['share-module', 'resource-box',
                            'resource-box-header'])]
    keep_only_tags = [
        dict(id=['story-head', 'story'])
    ]

    remove_tags = [
        dict(id=['share-module', 'resource-box',
                 'resource-box-header'])
    ]

    extra_css = '''
        h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
        h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;}
        h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
        h4{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
        p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
        #story-head h1{font-family :Arial,Helvetica,sans-serif; font-size: xx-large;}
        #story-head h2{font-family :Arial,Helvetica,sans-serif; font-size: small; color:#000000;}
        #story-head h3{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
        #story-head h4{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
        #story{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;}
        #Text-TextSubhed BoldCond PoynterAgateZero h3{color:#444444;font-family :Arial,Helvetica,sans-serif; font-size:small;}
        .p260x p{font-family :Arial,Helvetica,serif; font-size:x-small;font-style:italic;}
        .p260x h6{color:#777777;font-family :Arial,Helvetica,sans-serif; font-size:xx-small;}
    '''

    def parse_index(self):
        soup = self.index_to_soup('http://www.chron.com/news/')
@@ -64,3 +88,6 @@ class HoustonChronicle(BasicNewsRecipe):
            feeds.append((current_section, current_articles))
        return feeds
@@ -3,7 +3,7 @@ import re

class HuffingtonPostRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    __author__ = 'kwetal and Archana Raman'
    language = 'en'
    version = 2

@@ -14,70 +14,89 @@ class HuffingtonPostRecipe(BasicNewsRecipe):

    oldest_article = 1.1
    max_articles_per_feed = 100
    use_embedded_content = True
    #use_embedded_content = True

    encoding = 'utf-8'
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # Feeds from: http://www.huffingtonpost.com/syndication/
    feeds = []
    feeds.append((u'Latest News', u'http://feeds.huffingtonpost.com/huffingtonpost/LatestNews'))

    #feeds.append((u'Politics', u'http://www.huffingtonpost.com/feeds/verticals/politics/index.xml'))
    feeds.append((u'Politics: News', u'http://www.huffingtonpost.com/feeds/verticals/politics/news.xml'))
    feeds.append((u'Politics: Blog', u'http://www.huffingtonpost.com/feeds/verticals/politics/blog.xml'))
    feeds.append((u'Politics', u'http://www.huffingtonpost.com/feeds/verticals/politics/index.xml'))
    #feeds.append((u'Politics: News', u'http://www.huffingtonpost.com/feeds/verticals/politics/news.xml'))
    #feeds.append((u'Politics: Blog', u'http://www.huffingtonpost.com/feeds/verticals/politics/blog.xml'))

    #feeds.append((u'Media', u'http://www.huffingtonpost.com/feeds/verticals/media/index.xml'))
    feeds.append((u'Media: News', u'http://www.huffingtonpost.com/feeds/verticals/media/news.xml'))
    feeds.append((u'Media: Blog', u'http://www.huffingtonpost.com/feeds/verticals/media/blog.xml'))
    feeds.append((u'Media', u'http://www.huffingtonpost.com/feeds/verticals/media/index.xml'))
    #feeds.append((u'Media: News', u'http://www.huffingtonpost.com/feeds/verticals/media/news.xml'))
    #feeds.append((u'Media: Blog', u'http://www.huffingtonpost.com/feeds/verticals/media/blog.xml'))

    #feeds.append((u'Business', u'http://www.huffingtonpost.com/feeds/verticals/business/index.xml'))
    feeds.append((u'Business: News', u'http://www.huffingtonpost.com/feeds/verticals/business/news.xml'))
    feeds.append((u'Business: Blogs', u'http://www.huffingtonpost.com/feeds/verticals/business/blog.xml'))
    feeds.append((u'Business', u'http://www.huffingtonpost.com/feeds/verticals/business/index.xml'))
    #feeds.append((u'Business: News', u'http://www.huffingtonpost.com/feeds/verticals/business/news.xml'))
    #feeds.append((u'Business: Blogs', u'http://www.huffingtonpost.com/feeds/verticals/business/blog.xml'))

    #feeds.append((u'Entertainment', u'http://www.huffingtonpost.com/feeds/verticals/entertainment/index.xml'))
    feeds.append((u'Entertainment: News', u'http://www.huffingtonpost.com/feeds/verticals/business/news.xml'))
    feeds.append((u'Entertainment: Blog', u'http://www.huffingtonpost.com/feeds/verticals/entertainment/blog.xml'))
    feeds.append((u'Entertainment', u'http://www.huffingtonpost.com/feeds/verticals/entertainment/index.xml'))
    #feeds.append((u'Entertainment: News', u'http://www.huffingtonpost.com/feeds/verticals/business/news.xml'))
    #feeds.append((u'Entertainment: Blog', u'http://www.huffingtonpost.com/feeds/verticals/entertainment/blog.xml'))

    #feeds.append((u'Living', u'http://www.huffingtonpost.com/feeds/verticals/living/index.xml'))
    feeds.append((u'Living: News', u'http://www.huffingtonpost.com/feeds/verticals/living/news.xml'))
    feeds.append((u'Living: Blog', u'http://www.huffingtonpost.com/feeds/verticals/living/blog.xml'))
    feeds.append((u'Living', u'http://www.huffingtonpost.com/feeds/verticals/living/index.xml'))
    #feeds.append((u'Living: News', u'http://www.huffingtonpost.com/feeds/verticals/living/news.xml'))
    #feeds.append((u'Living: Blog', u'http://www.huffingtonpost.com/feeds/verticals/living/blog.xml'))

    #feeds.append((u'Style', u'http://www.huffingtonpost.com/feeds/verticals/style/index.xml'))
    feeds.append((u'Style: News', u'http://www.huffingtonpost.com/feeds/verticals/style/news.xml'))
    feeds.append((u'Style: Blog', u'http://www.huffingtonpost.com/feeds/verticals/style/blog.xml'))
    feeds.append((u'Style', u'http://www.huffingtonpost.com/feeds/verticals/style/index.xml'))
    #feeds.append((u'Style: News', u'http://www.huffingtonpost.com/feeds/verticals/style/news.xml'))
    #feeds.append((u'Style: Blog', u'http://www.huffingtonpost.com/feeds/verticals/style/blog.xml'))

    #feeds.append((u'Green', u'http://www.huffingtonpost.com/feeds/verticals/green/index.xml'))
    feeds.append((u'Green: News', u'http://www.huffingtonpost.com/feeds/verticals/green/news.xml'))
    feeds.append((u'Green: Blog', u'http://www.huffingtonpost.com/feeds/verticals/green/blog.xml'))
    feeds.append((u'Green', u'http://www.huffingtonpost.com/feeds/verticals/green/index.xml'))
    #feeds.append((u'Green: News', u'http://www.huffingtonpost.com/feeds/verticals/green/news.xml'))
    #feeds.append((u'Green: Blog', u'http://www.huffingtonpost.com/feeds/verticals/green/blog.xml'))

    #feeds.append((u'Technology', u'http://www.huffingtonpost.com/feeds/verticals/technology/index.xml'))
    feeds.append((u'Technology: News', u'http://www.huffingtonpost.com/feeds/verticals/technology/news.xml'))
    feeds.append((u'Technology: Blog', u'http://www.huffingtonpost.com/feeds/verticals/technology/blog.xml'))
    feeds.append((u'Technology', u'http://www.huffingtonpost.com/feeds/verticals/technology/index.xml'))
    #feeds.append((u'Technology: News', u'http://www.huffingtonpost.com/feeds/verticals/technology/news.xml'))
    #feeds.append((u'Technology: Blog', u'http://www.huffingtonpost.com/feeds/verticals/technology/blog.xml'))

    #feeds.append((u'Comedy', u'http://www.huffingtonpost.com/feeds/verticals/comedy/index.xml'))
    feeds.append((u'Comedy: News', u'http://www.huffingtonpost.com/feeds/verticals/comedy/news.xml'))
    feeds.append((u'Comedy: Blog', u'http://www.huffingtonpost.com/feeds/verticals/comedy/blog.xml'))
    feeds.append((u'Comedy', u'http://www.huffingtonpost.com/feeds/verticals/comedy/index.xml'))
    #feeds.append((u'Comedy: News', u'http://www.huffingtonpost.com/feeds/verticals/comedy/news.xml'))
    #feeds.append((u'Comedy: Blog', u'http://www.huffingtonpost.com/feeds/verticals/comedy/blog.xml'))

    #feeds.append((u'World', u'http://www.huffingtonpost.com/feeds/verticals/world/index.xml'))
    feeds.append((u'World: News', u'http://www.huffingtonpost.com/feeds/verticals/world/news.xml'))
    feeds.append((u'World: Blog', u'http://www.huffingtonpost.com/feeds/verticals/world/blog.xml'))
    feeds.append((u'World', u'http://www.huffingtonpost.com/feeds/verticals/world/index.xml'))
    #feeds.append((u'World: News', u'http://www.huffingtonpost.com/feeds/verticals/world/news.xml'))
    #feeds.append((u'World: Blog', u'http://www.huffingtonpost.com/feeds/verticals/world/blog.xml'))

    feeds.append((u'Original Reporting', u'http://www.huffingtonpost.com/tag/huffpolitics/feed'))
    feeds.append((u'Original Posts', u'http://www.huffingtonpost.com/feeds/original_posts/index.xml'))
    #feeds.append((u'Original Posts', u'http://www.huffingtonpost.com/feeds/original_posts/index.xml'))

    remove_tags = []
    remove_tags.append(dict(name='a', attrs={'href' : re.compile('http://feedads\.g\.doubleclick.net.*')}))
    remove_tags.append(dict(name='div', attrs={'class' : 'feedflare'}))
    remove_tags.append(dict(name='a', attrs={'class' : 'home_pixie'}))
    remove_tags.append(dict(name='div', attrs={'id' : ["top_nav",'threeup_top_wrapper','breaking_news_container',"hp_social_network"]}))
    remove_tags.append(dict(name='img', alt="Connect"))
    remove_tags.append(dict(name='div', attrs={'class' : ['logo']})) #'share_boxes_box_block_b_wraper',
|
||||
remove_tags.append(dict(name='div', attrs={'class' :[ 'read_more with_verticals','chicklets_box_outter_v05','blogger_menu_content','chicklets_bar']}))
|
||||
remove_tags.append(dict(name='div', attrs={'class' : ['sidebar_blog_first_design','sidebar_blog_second_design',]}))
|
||||
remove_tags.append(dict(name='div', attrs={'class' : ['main_big_news_ontop','login-menu','sidebar_blog_third_design','read_more']}))
|
||||
|
||||
|
||||
remove_tags_after = [dict(name='div', attrs={'class' : 'entry_content'}) ]
|
||||
# remove_attributes = ['style']
|
||||
|
||||
remove_attributes = ['style']
|
||||
|
||||
extra_css = '''
|
||||
h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
|
||||
h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
|
||||
h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
|
||||
body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
|
||||
h2{font-size: x-large; font-weight: bold; padding: 0em; margin-bottom: 0.2em;}
|
||||
a[href]{color: blue; text-decoration: none; cursor: pointer;}
|
||||
#title_permalink{color:black;font-size:large;}
|
||||
.date{color:#858585;font-family:"Times New Roman",sans-serif;}
|
||||
.comments_datetime v05{color:#696969;}
|
||||
.teaser_permalink{font-style:italic;font-size:xx-small;}
|
||||
.blog_posted_date{color:#696969;font-size:xx-small;font-weight: bold;}
|
||||
'''
|
||||
|
||||
#a[href]{color: blue; text-decoration: none; cursor: pointer;}
|
||||
def get_article_url(self, article):
|
||||
"""
|
||||
Workaround for Feedparser behaviour. If an item has more than one <link/> element, article.link is empty and
|
||||
@ -85,10 +104,21 @@ class HuffingtonPostRecipe(BasicNewsRecipe):
|
||||
Todo: refactor to searching this list to avoid the hardcoded zero-index
|
||||
"""
|
||||
link = article.get('link')
|
||||
print("Link:"+link)
|
||||
if not link:
|
||||
links = article.get('links')
|
||||
if links:
|
||||
link = links[0]['href']
|
||||
if not links[0]['href']:
|
||||
link = links[1]['href']
|
||||
|
||||
return link
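
The Todo above asks for a search in place of the hardcoded zero index; a minimal sketch of that refactor, assuming feedparser's usual links structure (a list of dicts carrying 'href' keys) and not the recipe author's actual implementation:

    def get_article_url(self, article):
        # Prefer the flat 'link' attribute, then fall back to the first
        # entry in 'links' that actually carries a non-empty href.
        link = article.get('link')
        if not link:
            for candidate in article.get('links', []):
                if candidate.get('href'):
                    link = candidate['href']
                    break
        return link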

    def postprocess_html(self, soup, first_fetch):
        for tag in soup.findAll('div', text="What's Your Reaction?"):
            tag.extract()

        for tg in soup.findAll('blockquote'):
            tg.extract()

        return soup
87  resources/recipes/johm.recipe  Normal file
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-

from calibre.web.feeds.recipes import BasicNewsRecipe

class JournalofHospitalMedicine(BasicNewsRecipe):

    title = 'Journal of Hospital Medicine'
    __author__ = 'Krittika Goyal'
    description = 'Medical news'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = True

    no_stylesheets = True
    #remove_tags_before = dict(name='div', attrs={'align':'center'})
    #remove_tags_after = dict(name='ol', attrs={'compact':'COMPACT'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':'subContent'}),
        dict(name='div', attrs={'id':['contentFrame']}),
        #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}),
        #dict(name='table', attrs={'align':'RIGHT'}),
    ]


    # TO LOGIN
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www3.interscience.wiley.com/cgi-bin/home')
        br.select_form(name='siteLogin')
        br['LoginName'] = self.username
        br['Password'] = self.password
        response = br.submit()
        raw = response.read()
        if 'userName = ""' in raw:
            raise Exception('Login failed. Check your username and password')
        return br

    # TO GET ARTICLE TOC
    def johm_get_index(self):
        return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home')

    # To parse article toc
    def parse_index(self):
        parse_soup = self.johm_get_index()

        div = parse_soup.find(id='contentCell')

        current_section = None
        current_articles = []
        feeds = []
        for x in div.findAll(True):
            if x.name == 'h4':
                # Section heading found
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                current_articles = []
                self.log('\tFound section:', current_section)
            if current_section is not None and x.name == 'strong':
                title = self.tag_to_string(x)
                p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x)
                if p is None:
                    continue
                url = p.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://www3.interscience.wiley.com'+url
                url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                #if url.startswith('/'):
                    #url = 'http://online.wsj.com'+url
                current_articles.append({'title': title, 'url':url,
                    'description':'', 'date':''})

        if current_articles and current_section:
            feeds.append((current_section, current_articles))

        return feeds
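
For reference, parse_index hands back a list of (section title, article list) pairs; a minimal illustration of the expected shape (all values made up, not taken from the journal site):

    feeds = [
        ('Original Articles', [
            {'title': 'Some article', 'url': 'http://example.com/a',
             'description': '', 'date': ''},
        ]),
    ]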

    def preprocess_html(self, soup):
        for img in soup.findAll('img', src=True):
            img['src'] = img['src'].replace('tfig', 'nfig')
        return soup
37  resources/recipes/kukuburi.recipe  Normal file
@@ -0,0 +1,37 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = 'Mori'
__version__ = 'v. 0.1'
'''
Kukuburi.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
import re

class KukuburiRecipe(BasicNewsRecipe):
    __author__ = 'Mori'
    language = 'en'

    title = u'Kukuburi'
    publisher = u'Ramón Pérez'
    description = u'KUKUBURI by Ram\xf3n P\xe9rez'

    no_stylesheets = True
    remove_javascript = True

    oldest_article = 100
    max_articles_per_feed = 100

    feeds = [
        (u'Kukuburi', u'http://feeds2.feedburner.com/Kukuburi')
    ]

    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            (r'<!--.*?-->', lambda match: ''),
            (r'<div class="feedflare".*?</div>', lambda match: '')
        ]
    ]
@@ -1,7 +1,5 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
lasegunda.com
'''
@@ -19,43 +17,38 @@ class LaSegunda(BasicNewsRecipe):
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
    remove_javascript = True
    masthead_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
    remove_empty_feeds = True
    language = 'es'
    extra_css = ' .titulonegritastop{font-size: xx-large; font-weight: bold} '

    conversion_options = {
        'comment'          : description
        , 'tags'             : category
        , 'publisher'        : publisher
        , 'language'         : language
        , 'linearize_tables' : True
    }

    html2lrf_options = [
        '--comment', description
        , '--category', category
        , '--publisher', publisher
        , '--ignore-tables'
    ]
    remove_tags_before = dict(attrs={'class':'titulonegritastop'})
    remove_tags = [dict(name='img')]
    remove_attributes = ['width','height']

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='table')]

    feeds = [
        (u'Noticias de ultima hora', u'http://www.lasegunda.com/rss20/index.asp?canal=0')
        ,(u'Politica', u'http://www.lasegunda.com/rss20/index.asp?canal=21')
        ,(u'Cronica', u'http://www.lasegunda.com/rss20/index.asp?canal=20')
        ,(u'Internacional', u'http://www.lasegunda.com/rss20/index.asp?canal=23')
        ,(u'Deportes', u'http://www.lasegunda.com/rss20/index.asp?canal=24')
        ,(u'Epectaculos/Cultura', u'http://www.lasegunda.com/rss20/index.asp?canal=25')
        ,(u'Educacion', u'http://www.lasegunda.com/rss20/index.asp?canal=26')
        ,(u'Ciencia y Tecnologia', u'http://www.lasegunda.com/rss20/index.asp?canal=27')
        ,(u'Solidaridad', u'http://www.lasegunda.com/rss20/index.asp?canal=28')
        ,(u'Buena Vida', u'http://www.lasegunda.com/rss20/index.asp?canal=32')
        ,(u'Politica' , u'http://www.lasegunda.com/rss20/index.asp?canal=21')
        ,(u'Cronica' , u'http://www.lasegunda.com/rss20/index.asp?canal=20')
        ,(u'Internacional' , u'http://www.lasegunda.com/rss20/index.asp?canal=23')
        ,(u'Deportes' , u'http://www.lasegunda.com/rss20/index.asp?canal=24')
        ,(u'Epectaculos/Cultura' , u'http://www.lasegunda.com/rss20/index.asp?canal=25')
        ,(u'Educacion' , u'http://www.lasegunda.com/rss20/index.asp?canal=26')
        ,(u'Ciencia y Tecnologia' , u'http://www.lasegunda.com/rss20/index.asp?canal=27')
        ,(u'Solidaridad' , u'http://www.lasegunda.com/rss20/index.asp?canal=28')
        ,(u'Buena Vida' , u'http://www.lasegunda.com/rss20/index.asp?canal=32')
    ]

    def print_version(self, url):
        rest, sep, article_id = url.partition('index.asp?idnoticia=')
        return u'http://www.lasegunda.com/edicionOnline/include/secciones/_detalle_impresion.asp?idnoticia=' + article_id

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        return soup

@@ -1,7 +1,5 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
latercera.com
'''
@@ -18,32 +16,32 @@ class LaTercera(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1252'
    remove_javascript = True
    use_embedded_content = False
    remove_empty_feeds = True
    language = 'es'

    html2lrf_options = [
        '--comment', description
        , '--category', category
        , '--publisher', publisher
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
    conversion_options = {
        'comment'          : description
        , 'tags'             : category
        , 'publisher'        : publisher
        , 'language'         : language
        , 'linearize_tables' : True
    }

    keep_only_tags = [dict(name='div', attrs={'class':['span-16 articulo border','span-16 border','span-16']}) ]

    remove_tags = [
        dict(name='script')
        ,dict(name='ul')
        dict(name=['ul','input','base'])
        ,dict(name='div', attrs={'id':['boxComentarios','shim','enviarAmigo']})
        ,dict(name='div', attrs={'class':['ad640','span-10 imgSet A','infoRelCol']})
        ,dict(name='input')
        ,dict(name='p', attrs={'id':['mensajeError','mensajeEnviandoNoticia','mensajeExito']})
    ]


    feeds = [
        (u'Noticias de ultima hora', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&ul=1')
        ,(u'Pais', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=654')
        ,(u'Nacional', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=680')
        ,(u'Politica', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=674')
        ,(u'Mundo', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=678')
        ,(u'Deportes', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=656')
        ,(u'Negocios', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=655')
@@ -55,10 +53,6 @@ class LaTercera(BasicNewsRecipe):
    ]

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    language = 'es'

@@ -1,11 +1,11 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
'''
newscientist.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class NewScientist(BasicNewsRecipe):
@@ -15,12 +15,14 @@ class NewScientist(BasicNewsRecipe):
    language = 'en'
    publisher = 'New Scientist'
    category = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
    delay = 3
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    cover_url = 'http://www.newscientist.com/currentcover.jpg'
    masthead_url = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
    encoding = 'utf-8'
    extra_css = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} '

    conversion_options = {
        'comment' : description
@@ -28,14 +30,18 @@ class NewScientist(BasicNewsRecipe):
        , 'publisher' : publisher
        , 'language' : language
    }
    preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]

    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol']})]
    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]

    remove_tags = [
        dict(name='div', attrs={'class':['hldBd','adline','pnl','infotext' ]})
        ,dict(name='div', attrs={'id' :['compnl','artIssueInfo','artTools']})
        dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]})
        ,dict(name='div' , attrs={'id' :['compnl','artIssueInfo','artTools']})
        ,dict(name='p' , attrs={'class':['marker','infotext' ]})
        ,dict(name='meta' , attrs={'name' :'description' })
    ]
    remove_tags_after = dict(attrs={'class':'nbpcopy'})
    remove_attributes = ['height','width']

    feeds = [
        (u'Latest Headlines' , u'http://feeds.newscientist.com/science-news' )
@@ -50,9 +56,15 @@ class NewScientist(BasicNewsRecipe):
    ]

    def get_article_url(self, article):
        url = article.get('guid', None)
        return url
        return article.get('guid', None)

    def print_version(self, url):
        return url + '?full=true&print=true'

    def preprocess_html(self, soup):
        for tg in soup.findAll('a'):
            if tg.string == 'Home':
                tg.parent.extract()
        return self.adeify_images(soup)
        return self.adeify_images(soup)

@@ -1,7 +1,5 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
nspm.rs
'''
@@ -22,16 +20,15 @@ class Nspm(BasicNewsRecipe):
    INDEX = 'http://www.nspm.rs/?alphabet=l'
    encoding = 'utf-8'
    language = 'sr'

    lang = 'sr-Latn-RS'
    masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : lang
        , 'pretty_print' : True
        , 'language' : language
        , 'linearize_tables' : True
    }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@@ -39,6 +36,8 @@ class Nspm(BasicNewsRecipe):
        dict(name=['link','object','embed'])
        ,dict(name='td', attrs={'class':'buttonheading'})
    ]
    remove_tags_after = dict(attrs={'class':'article_separator'})
    remove_attributes = ['width','height']

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
@@ -51,17 +50,6 @@ class Nspm(BasicNewsRecipe):
        return url.replace('.html','/stampa.html')

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        attribs = [ 'style','font','valign'
                    ,'colspan','width','height'
                    ,'rowspan','summary','align'
                    ,'cellspacing','cellpadding'
                    ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        for item in soup.body.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)
@@ -7,10 +7,11 @@ sfgate.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
import re

class SanFranciscoChronicle(BasicNewsRecipe):
    title = u'San Francisco Chronicle'
    __author__ = u'Darko Miletic'
    __author__ = u'Darko Miletic and Sujata Raman'
    description = u'San Francisco news'
    language = 'en'

@@ -19,13 +20,56 @@ class SanFranciscoChronicle(BasicNewsRecipe):
    no_stylesheets = True
    use_embedded_content = False

    remove_tags_before = {'class':'articleheadings'}
    remove_tags_after = dict(name='div', attrs={'id':'articlecontent' })


    remove_tags_before = {'id':'printheader'}

    remove_tags = [
        dict(name='div', attrs={'class':'tools tools_top'})
        ,dict(name='div', attrs={'id':'articlebox' })
        dict(name='div', attrs={'id':'printheader'})
        ,dict(name='a', attrs={'href':re.compile('http://ads\.pheedo\.com.*')})
        ,dict(name='div', attrs={'id':'footer'})
    ]

    extra_css = '''
        h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
        h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;}
        h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
        h4{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
        p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
        .byline{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
        .date{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
        .dtlcomment{font-style:italic;}
        .georgia h3{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#000000;}
        '''

    feeds = [
        (u'Top News Stories', u'http://www.sfgate.com/rss/feeds/news.xml')
    ]

    def print_version(self, url):
        url = url + "&type=printable"
        return url

    def get_article_url(self, article):
        print str(article['title_detail']['value'])
        url = article.get('guid', None)
        if url:
            url = "http://www.sfgate.com/cgi-bin/article.cgi?f=" + url
        if "Presented By:" in str(article['title_detail']['value']):
            url = ''
        return url

42  resources/recipes/sfbg.recipe  Normal file
@@ -0,0 +1,42 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class SanFranciscoBayGuardian(BasicNewsRecipe):
    title = u'San Francisco Bay Guardian'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1 # days
    max_articles_per_feed = 25
    #encoding = 'latin1'

    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'id':'story_header'})
    remove_tags_after = dict(name='div', attrs={'id':'shirttail'})
    remove_tags = [
        dict(name='iframe'),
        #dict(name='div', attrs={'class':'related-articles'}),
        dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
        #dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='ul', attrs={'id':'story_tabs'}),
    ]


    feeds = [
        ('Cover', 'http://www.newsobserver.com/100/index.rss'),
        ('News', 'http://www.newsobserver.com/102/index.rss'),
        ('Politics', 'http://www.newsobserver.com/105/index.rss'),
        ('Business', 'http://www.newsobserver.com/104/index.rss'),
        ('Sports', 'http://www.newsobserver.com/103/index.rss'),
        ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
        ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
        ('Editorials', 'http://www.newsobserver.com/158/index.rss')]


    def preprocess_html(self, soup):
        story = soup.find(name='div', attrs={'id':'story_body'})
        #td = heading.findParent(name='td')
        #td.extract()
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup
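
The preprocess_html above rebuilds a bare page around the story div, a common recipe trick for discarding everything but the article. A minimal standalone sketch of the same idea (the shell markup and div id are illustrative, not the recipe's exact values):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def keep_only(soup, div_id):
        # Pull the wanted container out of the fetched page...
        story = soup.find(name='div', attrs={'id': div_id})
        # ...and graft it into an otherwise empty document.
        shell = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        shell.find(name='body').insert(0, story)
        return shell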
52  resources/recipes/smith.recipe  Normal file
@@ -0,0 +1,52 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class SmithsonianMagazine(BasicNewsRecipe):
    title = u'Smithsonian Magazine'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 31 # days
    max_articles_per_feed = 50
    #encoding = 'latin1'
    recursions = 1
    match_regexps = ['&page=[2-9]$']

    remove_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after = dict(name='p', attrs={'id':'articlePaginationWrapper'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':'article_sidebar_border'}),
        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
        #dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
    ]


    feeds = [
        ('History and Archeology',
         'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
        ('People and Places',
         'http://feeds.feedburner.com/smithsonianmag/people-places'),
        ('Science and Nature',
         'http://feeds.feedburner.com/smithsonianmag/science-nature'),
        ('Arts and Culture',
         'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
        ('Travel',
         'http://feeds.feedburner.com/smithsonianmag/travel'),
    ]

    def preprocess_html(self, soup):
        story = soup.find(name='div', attrs={'id':'article-left'})
        #td = heading.findParent(name='td')
        #td.extract()
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup

    def postprocess_html(self, soup, first):
        for p in soup.findAll(id='articlePaginationWrapper'):
            p.extract()
        if not first:
            for div in soup.findAll(id='article-head'):
                div.extract()
        return soup
@@ -9,16 +9,35 @@ class StrategyBusinessRecipe(BasicNewsRecipe):
    title = u'Strategy+Business'
    publisher = u' Booz & Company'
    category = u'Business'
    description = u'Business magazine for senior business executives and the people who influence them.'
    description = (u'Business magazine for senior business executives and the people who influence them. '
            'Go to http://www.strategy-business.com/registration to sign up for a free account')

    oldest_article = 13 * 7 # 3 months
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True
    needs_subscription = True

    no_stylesheets = True
    remove_javascript = True

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.strategy-business.com/registration')
        for i, f in enumerate(br.forms()):
            if 'gatekeeper_edit' in f.name:
                br.select_form(name=f.name)
                for c in f.controls:
                    if c.name.endswith('_email'):
                        br[c.name] = self.username
                    elif c.name.endswith('_password'):
                        br[c.name] = self.password
        raw = br.submit().read()
        if '>Logout' not in raw:
            raise ValueError('Failed to login, check your username and password')
        return br


    extra_css = '''
        body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
        a {text-decoration: none; color: blue;}

@@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

class al(BasicNewsRecipe):
    author = 'Lorenzo Vigentini'
    description = 'the Escapist Magazine'
    description = 'The Escapist Magazine'

    cover_url = 'http://cdn.themis-media.com/themes/escapistmagazine/default/images/logo.png'
    title = u'the Escapist Magazine'

39  resources/recipes/thecultofghoul.recipe  Normal file
@@ -0,0 +1,39 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
cultofghoul.blogspot.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class TheCultOfGhoul(BasicNewsRecipe):
    title = 'The Cult of Ghoul'
    __author__ = 'Darko Miletic'
    description = 'Filmski blog'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'sr'
    encoding = 'utf-8'
    no_stylesheets = True
    use_embedded_content = True
    extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } '

    conversion_options = {
        'comment' : description
        , 'tags' : 'film, blog, srbija, strava, uzas'
        , 'publisher': 'Dejan Ognjanovic'
        , 'language' : language
    }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)

@@ -50,7 +50,11 @@ class WallStreetJournal(BasicNewsRecipe):
        br.select_form(nr=0)
        br['user'] = self.username
        br['password'] = self.password
        br.submit()
        res = br.submit()
        raw = res.read()
        if 'Welcome,' not in raw:
            raise ValueError('Failed to log in to wsj.com, check your '
                    'username and password')
        return br
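
Both login helpers above follow the same sentinel-string pattern: submit the form, read the response, and fail fast if a marker known to appear only on a logged-in page is missing. A generic sketch of the pattern (LOGIN_URL and LOGGED_IN_MARKER are illustrative placeholders; only get_browser is calibre API):

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open(LOGIN_URL)                  # hypothetical login page
        br.select_form(nr=0)
        br['user'] = self.username
        br['password'] = self.password
        raw = br.submit().read()
        if LOGGED_IN_MARKER not in raw:     # e.g. 'Welcome,' or '>Logout'
            raise ValueError('Failed to log in, check your username and password')
        return br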

    def postprocess_html(self, soup, first):
@@ -69,8 +73,10 @@ class WallStreetJournal(BasicNewsRecipe):
        soup = self.wsj_get_index()

        year = strftime('%Y')
        for x in soup.findAll('td', attrs={'class':'b14'}):
        for x in soup.findAll('td', height='25', attrs={'class':'b14'}):
            txt = self.tag_to_string(x).strip()
            txt = txt.replace(u'\xa0', ' ')
            txt = txt.encode('ascii', 'ignore')
            if year in txt:
                self.timefmt = ' [%s]'%txt
                break
@@ -11,7 +11,8 @@ import sys, re, os, platform

is64bit = platform.architecture()[0] == '64bit'
iswindows = re.search('win(32|64)', sys.platform)
isosx = 'darwin' in sys.platform
islinux = not isosx and not iswindows
isfreebsd = 'freebsd' in sys.platform
islinux = not isosx and not iswindows and not isfreebsd
SRC = os.path.abspath('src')
sys.path.insert(0, SRC)
sys.resources_location = os.path.join(os.path.dirname(SRC), 'resources')
@@ -117,7 +118,7 @@ class Command(object):
        self.real_user = os.environ.get('SUDO_USER', None)

    def drop_privileges(self):
        if not islinux or isosx:
        if not islinux or isosx or isfreebsd:
            return
        if self.real_user is not None:
            self.info('Dropping privileges to those of', self.real_user+':',
@@ -128,7 +129,7 @@ class Command(object):
            os.seteuid(int(self.real_uid))

    def regain_privileges(self):
        if not islinux or isosx:
        if not islinux or isosx or isfreebsd:
            return
        if os.geteuid() != 0 and self.orig_euid == 0:
            self.info('Trying to get root privileges')

@@ -89,6 +89,7 @@ fc_inc = '/usr/include/fontconfig'
fc_lib = '/usr/lib'
podofo_inc = '/usr/include/podofo'
podofo_lib = '/usr/lib'
chmlib_inc_dirs = chmlib_lib_dirs = []

if iswindows:
    prefix = r'C:\cygwin\home\kovid\sw'
@@ -96,6 +97,10 @@ if iswindows:
    sw_lib_dir = os.path.join(prefix, 'lib')
    fc_inc = os.path.join(sw_inc_dir, 'fontconfig')
    fc_lib = sw_lib_dir
    chmlib_inc_dirs = consolidate('CHMLIB_INC_DIR', os.path.join(prefix,
            'build', 'chmlib-0.40', 'src'))
    chmlib_lib_dirs = consolidate('CHMLIB_LIB_DIR', os.path.join(prefix,
            'build', 'chmlib-0.40', 'src', 'Release'))
    png_inc_dirs = [sw_inc_dir]
    png_lib_dirs = [sw_lib_dir]
    png_libs = ['png12']

@@ -11,15 +11,16 @@ from distutils import sysconfig

from PyQt4.pyqtconfig import QtGuiModuleMakefile

from setup import Command, islinux, isosx, SRC, iswindows
from setup.build_environment import fc_inc, fc_lib, \
from setup import Command, islinux, isfreebsd, isosx, SRC, iswindows
from setup.build_environment import fc_inc, fc_lib, chmlib_inc_dirs, \
    fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, \
    podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \
    QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk, \
    magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \
    magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, jpg_lib_dirs
    magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, \
    jpg_lib_dirs, chmlib_lib_dirs
MT
isunix = islinux or isosx
isunix = islinux or isosx or isfreebsd

make = 'make' if isunix else NMAKE

@@ -56,6 +57,22 @@ if iswindows:
    pdfreflow_libs = ['advapi32', 'User32', 'Gdi32']

extensions = [

    Extension('chmlib',
        ['calibre/utils/chm/swig_chm.c'],
        libraries=['ChmLib' if iswindows else 'chm'],
        inc_dirs=chmlib_inc_dirs,
        lib_dirs=chmlib_lib_dirs,
        cflags=["-DSWIG_COBJECT_TYPES"]),

    Extension('chm_extra',
        ['calibre/utils/chm/extra.c'],
        libraries=['ChmLib' if iswindows else 'chm'],
        inc_dirs=chmlib_inc_dirs,
        lib_dirs=chmlib_lib_dirs,
        cflags=["-D__PYTHON__"]),


    Extension('pdfreflow',
        reflow_sources,
        headers=reflow_headers,
@@ -126,7 +143,7 @@ extensions = [
if iswindows:
    extensions.append(Extension('winutil',
        ['calibre/utils/windows/winutil.c'],
        libraries=['shell32', 'setupapi'],
        libraries=['shell32', 'setupapi', 'wininet'],
        cflags=['/X']
        ))

@@ -154,6 +171,13 @@ if islinux:
    ldflags.append('-lpython'+sysconfig.get_python_version())


if isfreebsd:
    cflags.append('-pthread')
    ldflags.append('-shared')
    cflags.append('-I'+sysconfig.get_python_inc())
    ldflags.append('-lpython'+sysconfig.get_python_version())


if isosx:
    x, p = ('i386', 'ppc')
    archs = ['-arch', x, '-arch', p, '-isysroot',
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'

import sys, os, textwrap, subprocess, shutil, tempfile, atexit, stat, shlex

from setup import Command, islinux, basenames, modules, functions, \
from setup import Command, islinux, isfreebsd, basenames, modules, functions, \
    __appname__, __version__

HEADER = '''\
@@ -116,7 +116,7 @@ class Develop(Command):


    def pre_sub_commands(self, opts):
        if not islinux:
        if not (islinux or isfreebsd):
            self.info('\nSetting up a source based development environment is only '
                    'supported on linux. On other platforms, see the User Manual'
                    ' for help with setting up a development environment.')
@@ -156,7 +156,7 @@ class Develop(Command):
            self.warn('Failed to compile mount helper. Auto mounting of',
                    ' devices will not work')

        if os.geteuid() != 0:
        if not isfreebsd and os.geteuid() != 0:
            return self.warn('Must be run as root to compile mount helper. Auto '
                    'mounting of devices will not work.')
        src = os.path.join(self.SRC, 'calibre', 'devices', 'linux_mount_helper.c')
@@ -168,6 +168,7 @@ class Develop(Command):
        ret = p.wait()
        if ret != 0:
            return warn()
        if not isfreebsd:
            os.chown(dest, 0, 0)
        os.chmod(dest, stat.S_ISUID|stat.S_ISGID|stat.S_IRUSR|stat.S_IWUSR|\
                stat.S_IXUSR|stat.S_IXGRP|stat.S_IXOTH)

@@ -42,6 +42,7 @@ class LinuxFreeze(Command):
    '/usr/lib/liblcms.so.1',
    '/tmp/calibre-mount-helper',
    '/usr/lib/libunrar.so',
    '/usr/lib/libchm.so.0',
    '/usr/lib/libsqlite3.so.0',
    '/usr/lib/libsqlite3.so.0',
    '/usr/lib/libmng.so.1',

@@ -459,7 +459,7 @@ class Py2App(object):

    @flush
    def add_misc_libraries(self):
        for x in ('usb', 'unrar', 'readline.6.0', 'wmflite-0.2.7'):
        for x in ('usb', 'unrar', 'readline.6.0', 'wmflite-0.2.7', 'chm.0'):
            info('\nAdding', x)
            x = 'lib%s.dylib'%x
            shutil.copy2(join(SW, 'lib', x), self.frameworks_dir)

@@ -12,7 +12,7 @@ warnings.simplefilter('ignore', DeprecationWarning)


from calibre.startup import plugins, winutil, winutilerror
from calibre.constants import iswindows, isosx, islinux, isfrozen, \
from calibre.constants import iswindows, isosx, islinux, isfreebsd, isfrozen, \
    terminal_controller, preferred_encoding, \
    __appname__, __version__, __author__, \
    win32event, win32api, winerror, fcntl, \
@@ -22,7 +22,7 @@ import mechanize

if False:
    winutil, winutilerror, __appname__, islinux, __version__
    fcntl, win32event, isfrozen, __author__, terminal_controller
    winerror, win32api
    winerror, win32api, isfreebsd

mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('text/x-sony-bbeb+xml', '.lrs')

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
__version__ = '0.6.42'
__version__ = '0.6.44'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

import re
@@ -22,7 +22,8 @@ terminal_controller = TerminalController(sys.stdout)
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
isosx = 'darwin' in sys.platform.lower()
isnewosx = isosx and getattr(sys, 'new_app_bundle', False)
islinux = not(iswindows or isosx)
isfreebsd = 'freebsd' in sys.platform.lower()
islinux = not(iswindows or isosx or isfreebsd)
isfrozen = hasattr(sys, 'frozen')
isunix = isosx or islinux

@@ -56,7 +57,8 @@ if plugins is None:
    sys.path.insert(0, plugin_path)

    for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc',
            'fontconfig', 'pdfreflow', 'progress_indicator'] + \
            'fontconfig', 'pdfreflow', 'progress_indicator', 'chmlib',
            'chm_extra'] + \
            (['winutil'] if iswindows else []) + \
            (['usbobserver'] if isosx else []):
        try:

@@ -119,11 +119,34 @@ class Plugin(object):

    def __enter__(self, *args):
        if self.plugin_path is not None:
            from calibre.utils.zipfile import ZipFile
            zf = ZipFile(self.plugin_path)
            extensions = set([x.rpartition('.')[-1].lower() for x in
                zf.namelist()])
            zip_safe = True
            for ext in ('pyd', 'so', 'dll', 'dylib'):
                if ext in extensions:
                    zip_safe = False
            if zip_safe:
                sys.path.insert(0, self.plugin_path)
                self.sys_insertion_path = self.plugin_path
            else:
                from calibre.ptempfile import TemporaryDirectory
                self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
                self.sys_insertion_path = self._sys_insertion_tdir.__enter__(*args)
                zf.extractall(self.sys_insertion_path)
                sys.path.insert(0, self.sys_insertion_path)
            zf.close()


    def __exit__(self, *args):
        if self.plugin_path in sys.path:
            sys.path.remove(self.plugin_path)
        ip, it = getattr(self, 'sys_insertion_path', None), getattr(self,
                '_sys_insertion_tdir', None)
        if ip in sys.path:
            sys.path.remove(ip)
        if hasattr(it, '__exit__'):
            it.__exit__(*args)
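
The changelog entry about plugin zip files with binary code corresponds to the __enter__ logic above: Python can import pure-Python modules straight from a zip on sys.path, but compiled extensions have to live on disk. A condensed standalone sketch of the same decision, using only the stdlib (tempfile.mkdtemp stands in for calibre's TemporaryDirectory context manager):

    import sys, tempfile, zipfile

    def add_plugin_to_path(plugin_zip):
        zf = zipfile.ZipFile(plugin_zip)
        exts = set(name.rpartition('.')[-1].lower() for name in zf.namelist())
        if exts.intersection(['pyd', 'so', 'dll', 'dylib']):
            # Compiled extensions cannot be imported from inside a zip:
            # unpack to a real directory and put that on sys.path instead.
            tdir = tempfile.mkdtemp('plugin_unzip')
            zf.extractall(tdir)
            sys.path.insert(0, tdir)
        else:
            # Pure Python: zipimport can load modules straight from the zip.
            sys.path.insert(0, plugin_zip)
        zf.close()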


class FileTypePlugin(Plugin):

@@ -70,9 +70,10 @@ class PML2PMLZ(FileTypePlugin):
        pmlz = zipfile.ZipFile(of.name, 'w')
        pmlz.write(pmlfile, os.path.basename(pmlfile))

        pml_img = os.path.basename(pmlfile)[0] + '_img'
        img_dir = pml_img if os.path.exists(pml_img) else 'images' if \
            os.path.exists('images') else ''
        pml_img = os.path.splitext(pmlfile)[0] + '_img'
        i_img = os.path.join(os.path.dirname(pmlfile),'images')
        img_dir = pml_img if os.path.isdir(pml_img) else i_img if \
            os.path.isdir(i_img) else ''
        if img_dir:
            for image in glob.glob(os.path.join(img_dir, '*.png')):
                pmlz.write(image, os.path.join('images', (os.path.basename(image))))
@@ -81,17 +82,6 @@ class PML2PMLZ(FileTypePlugin):
        return of.name


# CHM MODIFIED
class CHMMetadataReader(MetadataReaderPlugin):

    name = 'Read CHM metadata'
    file_types = set(['chm'])
    description = _('Read metadata from %s files') % 'CHM'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.metadata.chm import get_metadata
        return get_metadata(stream)

class ComicMetadataReader(MetadataReaderPlugin):

    name = 'Read comic metadata'
@@ -113,6 +103,17 @@ class ComicMetadataReader(MetadataReaderPlugin):
            mi.cover_data = (ext.lower(), data)
        return mi

class CHMMetadataReader(MetadataReaderPlugin):

    name = 'Read CHM metadata'
    file_types = set(['chm'])
    description = _('Read metadata from %s files') % 'CHM'

    def get_metadata(self, stream, ftype):
        from calibre.ebooks.chm.metadata import get_metadata
        return get_metadata(stream)


class EPUBMetadataReader(MetadataReaderPlugin):

    name = 'Read EPUB metadata'
@@ -394,7 +395,7 @@ from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.tcr.input import TCRInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lrf.input import LRFInput
from calibre.ebooks.chm.input import CHMInput # CHM MODIFIED
from calibre.ebooks.chm.input import CHMInput

from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.fb2.output import FB2Output
@@ -418,7 +419,7 @@ from calibre.devices.blackberry.driver import BLACKBERRY
from calibre.devices.cybook.driver import CYBOOK
from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
    POCKETBOOK360, GER2, ITALICA, ECLICTO, DBOOK, INVESBOOK, \
    BOOQ
    BOOQ, ELONEX
from calibre.devices.iliad.driver import ILIAD
from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
from calibre.devices.jetbook.driver import JETBOOK
@@ -433,6 +434,7 @@ from calibre.devices.nuut2.driver import NUUT2
from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import N516, EB511
from calibre.devices.teclast.driver import TECLAST_K3

from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
from calibre.library.catalog import CSV_XML, EPUB_MOBI
@@ -454,7 +456,7 @@ plugins += [
    TCRInput,
    TXTInput,
    LRFInput,
    CHMInput, # CHM MODIFIED
    CHMInput,
]
plugins += [
    EPUBOutput,
@@ -508,6 +510,8 @@ plugins += [
    README,
    N516,
    EB511,
    ELONEX,
    TECLAST_K3
]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
    x.__name__.endswith('MetadataReader')]
@@ -214,8 +214,21 @@ class InputFormatPlugin(Plugin):
        return ret

    def postprocess_book(self, oeb, opts, log):
        '''
        Called to allow the input plugin to perform postprocessing after
        the book has been parsed.
        '''
        pass

    def specialize(self, oeb, opts, log, output_fmt):
        '''
        Called to allow the input plugin to specialize the parsed book
        for a particular output format. Called after postprocess_book
        and before any transforms are performed on the parsed book.
        '''
        pass


class OutputFormatPlugin(Plugin):
    '''
    OutputFormatPlugins are responsible for converting an OEB document

@@ -235,7 +235,7 @@ class SonyReaderOutput(OutputProfile):
    description = _('This profile is intended for the SONY PRS line. '
            'The 500/505/600/700 etc.')

    screen_size = (600, 775)
    screen_size = (590, 775)
    dpi = 168.451
    fbase = 12
    fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]

@@ -235,6 +235,8 @@ def _run_filetype_plugins(path_to_file, ft=None, occasion='preprocess'):
    with plugin:
        try:
            nfp = plugin.run(path_to_file)
            if not nfp:
                nfp = path_to_file
        except:
            print 'Running file type plugin %s failed with traceback:'%plugin.name
            traceback.print_exc()
@@ -399,7 +401,7 @@ def initialize_plugins():
            plugin = load_plugin(zfp) if not isinstance(zfp, type) else zfp
        except PluginNotFound:
            continue
        plugin = initialize_plugin(plugin, zfp if not isinstance(zfp, type) else zfp)
        plugin = initialize_plugin(plugin, None if isinstance(zfp, type) else zfp)
        _initialized_plugins.append(plugin)
    except:
        print 'Failed to initialize plugin...'

@@ -23,6 +23,8 @@ Run an embedded python interpreter.
            help='Debug the specified device driver.')
    parser.add_option('-g', '--gui', default=False, action='store_true',
            help='Run the GUI',)
    parser.add_option('-w', '--viewer', default=False, action='store_true',
            help='Run the ebook viewer',)
    parser.add_option('--paths', default=False, action='store_true',
            help='Output the paths necessary to setup the calibre environment')
    parser.add_option('--migrate', action='store_true', default=False,
@@ -98,6 +100,12 @@ def main(args=sys.argv):
    if opts.gui:
        from calibre.gui2.main import main
        main(['calibre'])
    elif opts.viewer:
        from calibre.gui2.viewer.main import main
        vargs = ['ebook-viewer', '--debug-javascript']
        if len(args) > 1:
            vargs.append(args[-1])
        main(vargs)
    elif opts.command:
        sys.argv = args[:1]
        exec opts.command

@@ -60,8 +60,10 @@ def debug(ioreg_to_tmp=False, buf=None):
    if isosx:
        from calibre.devices.usbms.device import Device
        mount = repr(Device.osx_run_mount())
        ioreg = Device.run_ioreg()
        ioreg = 'Output from mount:\n\n'+mount+'\n\n'+ioreg
        drives = pprint.pformat(Device.osx_get_usb_drives())
        ioreg = 'Output from mount:\n'+mount+'\n\n'
        ioreg += 'Output from osx_get_usb_drives:\n'+drives+'\n\n'
        ioreg += Device.run_ioreg()
    connected_devices = []
    for dev in device_plugins():
        out('Looking for', dev.__class__.__name__)

@@ -15,7 +15,7 @@ class ANDROID(USBMS):
    supported_platforms = ['windows', 'osx', 'linux']

    # Ordered list of supported formats
    FORMATS = ['epub']
    FORMATS = ['epub', 'pdf']

    VENDOR_ID = {
        0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]},

@@ -195,3 +195,15 @@ class BOOQ(EB600):
    WINDOWS_MAIN_MEM = 'EB600'
    WINDOWS_CARD_A_MEM = 'EB600'

class ELONEX(EB600):

    name = 'Elonex 600EB'
    gui_name = 'Elonex'

    FORMATS = ['epub', 'pdf', 'txt', 'html']

    VENDOR_NAME = 'ELONEX'
    WINDOWS_MAIN_MEM = 'EBOOK'
    WINDOWS_CARD_A_MEM = 'EBOOK'


@@ -6,6 +6,7 @@ the GUI. A device backend must subclass the L{Device} class. See prs500.py for
a backend that implements the Device interface for the SONY PRS500 Reader.
"""
import os
from collections import namedtuple

from calibre.customize import Plugin
from calibre.constants import iswindows
@@ -43,6 +44,9 @@ class DevicePlugin(Plugin):
    #: Icon for this device
    icon = I('reader.svg')

    # Used by gui2.ui:annotations_fetched() and devices.kindle.driver:get_annotations()
    UserAnnotation = namedtuple('Annotation','type, bookmark')

    @classmethod
    def get_gui_name(cls):
        if hasattr(cls, 'gui_name'):
@@ -7,10 +7,9 @@ __docformat__ = 'restructuredtext en'
'''
Device driver for Amazon's Kindle
'''

import os
import re
import sys
import os, re, sys
from cStringIO import StringIO
from struct import unpack

from calibre.devices.usbms.driver import USBMS

@@ -44,6 +43,7 @@ class KINDLE(USBMS):
    EBOOK_DIR_CARD_A = 'documents'
    DELETE_EXTS = ['.mbp']
    SUPPORTS_SUB_DIRS = True
    SUPPORTS_ANNOTATIONS = True

    WIRELESS_FILE_NAME_PATTERN = re.compile(
        r'(?P<title>[^-]+)-asin_(?P<asin>[a-zA-Z\d]{10,})-type_(?P<type>\w{4})-v_(?P<index>\d+).*')
@@ -60,6 +60,73 @@ class KINDLE(USBMS):
            'replace')
        return mi

    def get_annotations(self, path_map):
        MBP_FORMATS = [u'azw', u'mobi', u'prc', u'txt']
        TAN_FORMATS = [u'tpz', u'azw1']

        mbp_formats = set()
        for fmt in MBP_FORMATS:
            mbp_formats.add(fmt)
        tan_formats = set()
        for fmt in TAN_FORMATS:
            tan_formats.add(fmt)

        def get_storage():
            storage = []
            if self._main_prefix:
                storage.append(os.path.join(self._main_prefix, self.EBOOK_DIR_MAIN))
            if self._card_a_prefix:
                storage.append(os.path.join(self._card_a_prefix, self.EBOOK_DIR_CARD_A))
            if self._card_b_prefix:
                storage.append(os.path.join(self._card_b_prefix, self.EBOOK_DIR_CARD_B))
            return storage

        def resolve_bookmark_paths(storage, path_map):
            pop_list = []
            book_ext = {}
            for id in path_map:
                file_fmts = set()
                for fmt in path_map[id]['fmts']:
                    file_fmts.add(fmt)

                bookmark_extension = None
                if file_fmts.intersection(mbp_formats):
                    book_extension = list(file_fmts.intersection(mbp_formats))[0]
                    bookmark_extension = 'mbp'
                elif file_fmts.intersection(tan_formats):
                    book_extension = list(file_fmts.intersection(tan_formats))[0]
                    bookmark_extension = 'tan'

                if bookmark_extension:
                    for vol in storage:
                        bkmk_path = path_map[id]['path'].replace(os.path.abspath('/<storage>'),vol)
                        bkmk_path = bkmk_path.replace('bookmark',bookmark_extension)
                        if os.path.exists(bkmk_path):
                            path_map[id] = bkmk_path
                            book_ext[id] = book_extension
                            break
                    else:
                        pop_list.append(id)
                else:
                    pop_list.append(id)

            # Remove non-existent bookmark templates
            for id in pop_list:
                path_map.pop(id)
            return path_map, book_ext

        storage = get_storage()
        path_map, book_ext = resolve_bookmark_paths(storage, path_map)

        bookmarked_books = {}
        for id in path_map:
            bookmark_ext = path_map[id].rpartition('.')[2]
            myBookmark = Bookmark(path_map[id], id, book_ext[id], bookmark_ext)
            bookmarked_books[id] = self.UserAnnotation(type='kindle', bookmark=myBookmark)

        # This returns as job.result in gui2.ui.annotations_fetched(self, job)
        return bookmarked_books
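
For orientation, the mapping get_annotations builds is keyed by calibre book id and valued with the UserAnnotation namedtuple declared on DevicePlugin above. A hedged sketch of how a consumer might walk it (the real consumer is annotations_fetched in gui2.ui; variable names here are illustrative):

    # dev: a connected KINDLE instance; path_map as described above.
    annotations = dev.get_annotations(path_map)
    for book_id, annot in annotations.items():
        bm = annot.bookmark          # a Bookmark, per the namedtuple fields
        print 'book %s: read %d%%, last location %d' % (
                book_id, bm.percent_read, bm.last_read_location)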


class KINDLE2(KINDLE):

@@ -79,3 +146,213 @@ class KINDLE_DX(KINDLE2):

    PRODUCT_ID = [0x0003]
    BCD = [0x0100]

class Bookmark():
    '''
    A simple class fetching bookmark data
    Kindle-specific
    '''
    def __init__(self, path, id, book_format, bookmark_extension):
        self.book_format = book_format
        self.bookmark_extension = bookmark_extension
        self.book_length = 0
        self.id = id
        self.last_read = 0
        self.last_read_location = 0
        self.timestamp = 0
        self.user_notes = None

        self.get_bookmark_data(path)
        self.get_book_length(path)
        try:
            self.percent_read = float(100*self.last_read / self.book_length)
        except:
            self.percent_read = 0

    def record(self, n):
        from calibre.ebooks.metadata.mobi import StreamSlicer
        if n >= self.nrecs:
            raise ValueError('non-existent record %r' % n)
        offoff = 78 + (8 * n)
        start, = unpack('>I', self.data[offoff + 0:offoff + 4])
        stop = None
        if n < (self.nrecs - 1):
            stop, = unpack('>I', self.data[offoff + 8:offoff + 12])
        return StreamSlicer(self.stream, start, stop)
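
The offsets in record() follow the PalmDB layout: a 78-byte fixed header, then one 8-byte record-info entry per record (a 4-byte big-endian file offset plus 4 bytes of attributes/unique id), so record n's offset lives at byte 78 + 8*n. A quick standalone check of that arithmetic (the sample bytes are fabricated):

    from struct import pack, unpack

    # Fabricated record-info area: record 0 starts at 0x200, record 1 at 0x380.
    header = '\x00' * 78 + pack('>II', 0x200, 0) + pack('>II', 0x380, 0)
    n = 1
    offoff = 78 + 8 * n
    start, = unpack('>I', header[offoff:offoff + 4])
    assert start == 0x380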

    def get_bookmark_data(self, path):
        ''' Return the timestamp and last_read_location '''
        from calibre.ebooks.metadata.mobi import StreamSlicer
        user_notes = {}
        if self.bookmark_extension == 'mbp':
            MAGIC_MOBI_CONSTANT = 150
            with open(path,'rb') as f:
                stream = StringIO(f.read())
                data = StreamSlicer(stream)
                self.timestamp, = unpack('>I', data[0x24:0x28])
                bpar_offset, = unpack('>I', data[0x4e:0x52])
                lrlo = bpar_offset + 0x0c
                self.last_read = int(unpack('>I', data[lrlo:lrlo+4])[0])
                self.last_read_location = self.last_read/MAGIC_MOBI_CONSTANT + 1
                entries, = unpack('>I', data[0x4a:0x4e])

                # Store the annotations/locations
                bpl = bpar_offset + 4
                bpar_len, = unpack('>I', data[bpl:bpl+4])
                bpar_len += 8
                #print "bpar_len: 0x%x" % bpar_len
                eo = bpar_offset + bpar_len

                # Walk bookmark entries
                #print " --- %s --- " % path
                current_entry = 1
                sig = data[eo:eo+4]
                previous_block = None

                while sig == 'DATA':
                    text = None
                    entry_type = None
                    rec_len, = unpack('>I', data[eo+4:eo+8])
                    if rec_len == 0:
                        current_block = "empty_data"
                    elif data[eo+8:eo+12] == "EBAR":
                        current_block = "data_header"
                        #entry_type = "data_header"
                        location, = unpack('>I', data[eo+0x34:eo+0x38])
                        #print "data_header location: %d" % location
                    else:
                        current_block = "text_block"
                        if previous_block == 'empty_data':
                            entry_type = 'Note'
                        elif previous_block == 'data_header':
                            entry_type = 'Highlight'
                        text = data[eo+8:eo+8+rec_len].decode('utf-16-be')

                    if entry_type:
                        displayed_location = location/MAGIC_MOBI_CONSTANT + 1
                        user_notes[location] = dict(id=self.id,
                            displayed_location=displayed_location,
                            type=entry_type,
                            text=text)

                    eo += rec_len + 8
                    current_entry += 1
                    previous_block = current_block
                    sig = data[eo:eo+4]

                while sig == 'BKMK':
                    # Fix start location for Highlights using BKMK data
                    end_loc, = unpack('>I', data[eo+0x10:eo+0x14])
                    if end_loc in user_notes and user_notes[end_loc]['type'] == 'Highlight':
                        start, = unpack('>I', data[eo+8:eo+12])
                        user_notes[start] = user_notes[end_loc]
                        user_notes.pop(end_loc)
                    elif end_loc in user_notes and user_notes[end_loc]['type'] == 'Note':
                        # Skip duplicate bookmarks for notes
                        pass
                    else:
                        # If a bookmark coincides with a user annotation, the locs could
                        # be the same - cheat by nudging -1
                        # Skip bookmark for last_read_location
                        if end_loc != self.last_read:
                            displayed_location = end_loc/MAGIC_MOBI_CONSTANT + 1
                            user_notes[end_loc - 1] = dict(id=self.id,
                                displayed_location=displayed_location,
                                type='Bookmark',
                                text=None)
                    rec_len, = unpack('>I', data[eo+4:eo+8])
                    eo += rec_len + 8
                    sig = data[eo:eo+4]

        elif self.bookmark_extension == 'tan':
            # TAN bookmarks
            MAGIC_TOPAZ_CONSTANT = 33.33
            self.timestamp = os.path.getmtime(path)
            with open(path,'rb') as f:
                stream = StringIO(f.read())
                data = StreamSlicer(stream)
                self.last_read = int(unpack('>I', data[5:9])[0])
                self.last_read_location = self.last_read/MAGIC_TOPAZ_CONSTANT + 1
                entries, = unpack('>I', data[9:13])
                current_entry = 0
                e_base = 0x0d
                while current_entry < entries:
                    location, = unpack('>I', data[e_base+2:e_base+6])
                    text = None
                    text_len, = unpack('>I', data[e_base+0xA:e_base+0xE])
                    e_type, = unpack('>B', data[e_base+1])
                    if e_type == 0:
                        e_type = 'Bookmark'
                    elif e_type == 1:
                        e_type = 'Highlight'
                        text = "(Topaz highlights not yet supported)"
                    elif e_type == 2:
                        e_type = 'Note'
                        text = data[e_base+0x10:e_base+0x10+text_len]
                    else:
                        e_type = 'Unknown annotation type'

                    if self.book_format in ['tpz','azw1']:
                        # *** This needs fine-tuning
                        displayed_location = location/MAGIC_TOPAZ_CONSTANT + 1
                    elif self.book_format == 'pdf':
                        # *** This needs testing
                        displayed_location = location
                    user_notes[location] = dict(id=self.id,
                        displayed_location=displayed_location,
                        type=e_type,
                        text=text)
                    if text_len == 0xFFFFFFFF:
                        e_base = e_base + 14
                    else:
                        e_base = e_base + 14 + 2 + text_len
                    current_entry += 1
                for location in user_notes:
                    if location == self.last_read:
                        user_notes.pop(location)
                        break
        else:
            print "unsupported bookmark_extension: %s" % self.bookmark_extension
        self.user_notes = user_notes

        '''
        for location in sorted(user_notes):
            print '  Location %d: %s\n%s' % (user_notes[location]['displayed_location'],
                user_notes[location]['type'],
                '\n'.join(self.textdump(user_notes[location]['text'])))
        '''

    def get_book_length(self, path):
        from calibre.ebooks.metadata.mobi import StreamSlicer
        book_fs = path.replace('.%s' % self.bookmark_extension,'.%s' % self.book_format)
|
||||
|
||||
self.book_length = 0
|
||||
if self.bookmark_extension == 'mbp':
|
||||
# Read the book len from the header
|
||||
with open(book_fs,'rb') as f:
|
||||
self.stream = StringIO(f.read())
|
||||
self.data = StreamSlicer(self.stream)
|
||||
self.nrecs, = unpack('>H', self.data[76:78])
|
||||
record0 = self.record(0)
|
||||
self.book_length = int(unpack('>I', record0[0x04:0x08])[0])
|
||||
elif self.bookmark_extension == 'tan':
|
||||
# Read bookLength from metadata
|
||||
with open(book_fs,'rb') as f:
|
||||
stream = StringIO(f.read())
|
||||
raw = stream.read(8*1024)
|
||||
if not raw.startswith('TPZ'):
|
||||
raise ValueError('Not a Topaz file')
|
||||
first = raw.find('metadata')
|
||||
if first < 0:
|
||||
raise ValueError('Invalid Topaz file')
|
||||
second = raw.find('metadata', first+10)
|
||||
if second < 0:
|
||||
raise ValueError('Invalid Topaz file')
|
||||
raw = raw[second:second+1000]
|
||||
idx = raw.find('bookLength')
|
||||
if idx > -1:
|
||||
length = ord(raw[idx+len('bookLength')])
|
||||
self.book_length = int(raw[idx+len('bookLength')+1:idx+len('bookLength')+1+length])
|
||||
|
||||
else:
|
||||
print "unsupported bookmark_extension: %s" % self.bookmark_extension
|
||||
|
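# --- Illustrative sketch (not part of the commit): one plausible way to
# exercise the Bookmark class above against an .mbp sidecar sitting next to
# its book. The path and book id are hypothetical; 'mobi' and 'mbp' mirror
# the book_ext/bookmark_ext pairing that get_annotations() derives via
# rpartition('.').
def describe_annotations(sidecar_path, book_id):
    bm = Bookmark(sidecar_path, book_id, 'mobi', 'mbp')
    print '%.0f%% read, last location %d' % (bm.percent_read, bm.last_read_location)
    for loc in sorted(bm.user_notes):
        note = bm.user_notes[loc]
        print '  %s at %d: %s' % (note['type'], note['displayed_location'], note['text'] or '')
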
@ -8,10 +8,10 @@ from ctypes import cdll, POINTER, byref, pointer, Structure as _Structure, \
    c_ubyte, c_ushort, c_int, c_char, c_void_p, c_byte, c_uint
from errno import EBUSY, ENOMEM

from calibre import iswindows, isosx, load_library
from calibre import iswindows, isosx, isfreebsd, load_library

_libusb_name = 'libusb'
PATH_MAX = 511 if iswindows else 1024 if isosx else 4096
PATH_MAX = 511 if iswindows else 1024 if (isosx or isfreebsd) else 4096
if iswindows:
    class Structure(_Structure):
        _pack_ = 1

10  src/calibre/devices/teclast/__init__.py  Normal file
@ -0,0 +1,10 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

42  src/calibre/devices/teclast/driver.py  Normal file
@ -0,0 +1,42 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.devices.usbms.driver import USBMS

class TECLAST_K3(USBMS):

    name = 'Teclast K3 Device Interface'
    gui_name = 'K3'
    description = _('Communicate with the Teclast K3 reader.')
    author = 'Kovid Goyal'
    supported_platforms = ['windows', 'osx', 'linux']

    # Ordered list of supported formats
    FORMATS = ['epub', 'fb2', 'doc', 'pdf', 'txt']

    VENDOR_ID = [0x071b]
    PRODUCT_ID = [0x3203]
    BCD = [0x0000]

    VENDOR_NAME = 'TECLAST'
    WINDOWS_MAIN_MEM = 'DIGITAL_PLAYER'
    WINDOWS_CARD_A_MEM = 'DIGITAL_PLAYER'

    MAIN_MEMORY_VOLUME_LABEL = 'K3 Main Memory'
    STORAGE_CARD_VOLUME_LABEL = 'K3 Storage Card'

    EBOOK_DIR_MAIN = ''
    EBOOK_DIR_CARD_A = ''
    SUPPORTS_SUB_DIRS = True

    def windows_sort_drives(self, drives):
        main = drives.get('main', None)
        card = drives.get('carda', None)
        if card and main and card < main:
            drives['main'] = card
            drives['carda'] = main

        return drives

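# --- Illustrative sketch (not part of the commit): the swap performed by
# windows_sort_drives() above, shown on a plain dict. On Windows the main
# memory and the storage card can enumerate in either order; the driver pins
# the lower drive letter to 'main'.
drives = {'main': 'G:\\', 'carda': 'E:\\'}
if drives['carda'] < drives['main']:
    drives['main'], drives['carda'] = drives['carda'], drives['main']
# drives is now {'main': 'E:\\', 'carda': 'G:\\'}
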
@ -4,8 +4,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import os
import shutil
import os, shutil, time

from calibre.devices.errors import PathError

@ -50,11 +49,12 @@ class CLI(object):
        d = os.path.dirname(path)
        if not os.path.exists(d):
            os.makedirs(d)
        with open(path, 'wb') as dest:
        with open(path, 'w+b') as dest:
            try:
                shutil.copyfileobj(infile, dest)
            except IOError:
                print 'WARNING: First attempt to send file to device failed'
                time.sleep(0.2)
                infile.seek(0)
                dest.seek(0)
                dest.truncate()

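# --- Illustrative sketch (not part of the commit): the retry-once pattern
# used by put_file() above, isolated. Opening 'w+b' instead of 'wb' lets the
# failed first attempt be rewound and truncated. The hunk is cut off before
# the retry itself, so the second copyfileobj() call here is an assumption
# about what follows.
import shutil, time

def copy_with_one_retry(infile, path):
    with open(path, 'w+b') as dest:
        try:
            shutil.copyfileobj(infile, dest)
        except IOError:
            # transient USB write failure: wait, rewind both ends, retry once
            time.sleep(0.2)
            infile.seek(0)
            dest.seek(0)
            dest.truncate()
            shutil.copyfileobj(infile, dest)
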
@ -17,6 +17,7 @@ import time
import re
import sys
import glob

from itertools import repeat

from calibre.devices.interface import DevicePlugin

@ -333,10 +334,14 @@ class Device(DeviceConfig, DevicePlugin):
                raise
            time.sleep(2)

    def _osx_bsd_names(self):
    @classmethod
    def osx_get_usb_drives(cls):
        if usbobserver_err:
            raise RuntimeError('Failed to load usbobserver: '+usbobserver_err)
        drives = usbobserver.get_usb_drives()
        return usbobserver.get_usb_drives()

    def _osx_bsd_names(self):
        drives = self.osx_get_usb_drives()
        matches = []
        d = self.detected_device
        if d.serial:

@ -394,16 +399,6 @@ class Device(DeviceConfig, DevicePlugin):
        if len(matches) > 2:
            drives['cardb'] = matches[2]

        pat = self.OSX_MAIN_MEM_VOL_PAT
        if pat is not None and len(drives) > 1 and 'main' in drives:
            if pat.search(drives['main']) is None:
                main = drives['main']
                for x in ('carda', 'cardb'):
                    if x in drives and pat.search(drives[x]):
                        drives['main'] = drives.pop(x)
                        drives[x] = main
                        break

        return drives

    def osx_bsd_names(self):

@ -427,6 +422,16 @@ class Device(DeviceConfig, DevicePlugin):
        if drives['main'] is None:
            print bsd_drives, mount_map, drives
            raise DeviceError(_('Unable to detect the %s mount point. Try rebooting.')%self.__class__.__name__)
        pat = self.OSX_MAIN_MEM_VOL_PAT
        if pat is not None and len(drives) > 1 and 'main' in drives:
            if pat.search(drives['main']) is None:
                main = drives['main']
                for x in ('carda', 'cardb'):
                    if x in drives and pat.search(drives[x]):
                        drives['main'] = drives.pop(x)
                        drives[x] = main
                        break

        self._main_prefix = drives['main']+os.sep
        def get_card_prefix(c):
            ans = drives.get(c, None)

@ -789,7 +794,13 @@ class Device(DeviceConfig, DevicePlugin):
        '''
        return components

    def create_upload_path(self, path, mdata, fname):
    def get_annotations(self, path_map):
        '''
        Resolve path_map to annotation_map of files found on the device
        '''
        return {}

    def create_upload_path(self, path, mdata, fname, create_dirs=True):
        path = os.path.abspath(path)
        extra_components = []

@ -848,7 +859,7 @@ class Device(DeviceConfig, DevicePlugin):
        filedir = os.path.dirname(filepath)

        if not os.path.exists(filedir):
        if create_dirs and not os.path.exists(filedir):
            os.makedirs(filedir)

        return filepath

@ -123,7 +123,7 @@ class USBMS(CLI, Device):
        '''
        :path: the full path were the associated book is located.
        :filename: the name of the book file without the extension.
        :metatdata: metadata belonging to the book. Use metadata.thumbnail
        :metadata: metadata belonging to the book. Use metadata.thumbnail
                   for cover
        '''
        pass

@ -129,3 +129,12 @@ def render_html(path_to_html, width=590, height=750):
    del loop
    return renderer

def check_ebook_format(stream, current_guess):
    ans = current_guess
    if current_guess.lower() in ('prc', 'mobi', 'azw', 'azw1'):
        stream.seek(0)
        if stream.read(3) == 'TPZ':
            ans = 'tpz'
        stream.seek(0)
    return ans

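# --- Illustrative sketch (not part of the commit): using check_ebook_format()
# above to re-detect a Topaz file shipped with a MOBI-family extension, as the
# add-books code does for PRC/MOBI uploads. The file name is hypothetical.
with open('book.azw', 'rb') as stream:
    fmt = check_ebook_format(stream, 'azw')
    # fmt == 'tpz' when the file starts with the 'TPZ' magic, else 'azw'
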
@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
                     "x-sjis" : "shift-jis" }


def force_encoding(raw, verbose):
def force_encoding(raw, verbose, assume_utf8=False):
    from calibre.constants import preferred_encoding
    try:
        chardet = detect(raw)
    except:
        chardet = {'encoding':preferred_encoding, 'confidence':0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1 and assume_utf8:
        encoding = 'utf-8'
    if chardet['confidence'] < 1 and verbose:
        print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
    if not encoding:

@ -73,7 +75,7 @@ def force_encoding(raw, verbose):


def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False):
                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and

@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
            encoding = match.group(1)
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose)
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    try:
        if encoding.lower().strip() == 'macintosh':
            encoding = 'mac-roman'

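# --- Illustrative sketch (not part of the commit): the effect of the new
# assume_utf8 flag. With no declared encoding and chardet confidence below
# 100%, decoding now prefers UTF-8 over the detector's guess, which is what
# the NCX fix later in this commit relies on. The file name is hypothetical.
raw = open('toc.ncx', 'rb').read()      # an NCX with no encoding declaration
text = xml_to_unicode(raw, assume_utf8=True)[0]
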
@ -1,213 +1,17 @@
from __future__ import with_statement
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                ' and Alex Bramley <a.bramley at gmail.com>.'

import os, shutil, uuid, re
from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
import os, uuid

from BeautifulSoup import BeautifulSoup, NavigableString
from lxml import html
from pychm.chm import CHMFile
from pychm.chmlib import (
    CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
    chm_enumerate,
)

from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.utils.config import OptionParser
from calibre.ebooks.metadata.toc import TOC
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename


def match_string(s1, s2_already_lowered):
    if s1 is not None and s2_already_lowered is not None:
        if s1.lower()==s2_already_lowered:
            return True
    return False

def check_all_prev_empty(tag):
    if tag is None:
        return True
    if tag.__class__ == NavigableString and not check_empty(tag):
        return False
    return check_all_prev_empty(tag.previousSibling)

def check_empty(s, rex = re.compile(r'\S')):
    return rex.search(s) is None


def option_parser():
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
    parser.add_option('--verbose', default=False, action='store_true', dest='verbose')
    parser.add_option("-t", "--title", action="store", type="string", \
                      dest="title", help=_("Set the book title"))
    parser.add_option('--title-sort', action='store', type='string', default=None,
                      dest='title_sort', help=_('Set sort key for the title'))
    parser.add_option("-a", "--author", action="store", type="string", \
                      dest="author", help=_("Set the author"))
    parser.add_option('--author-sort', action='store', type='string', default=None,
                      dest='author_sort', help=_('Set sort key for the author'))
    parser.add_option("-c", "--category", action="store", type="string", \
                      dest="category", help=_("The category this book belongs"
                      " to. E.g.: History"))
    parser.add_option("--thumbnail", action="store", type="string", \
                      dest="thumbnail", help=_("Path to a graphic that will be"
                      " set as this files' thumbnail"))
    parser.add_option("--comment", action="store", type="string", \
                      dest="freetext", help=_("Path to a txt file containing a comment."))
    parser.add_option("--get-thumbnail", action="store_true", \
                      dest="get_thumbnail", default=False, \
                      help=_("Extract thumbnail from LRF file"))
    parser.add_option('--publisher', default=None, help=_('Set the publisher'))
    parser.add_option('--classification', default=None, help=_('Set the book classification'))
    parser.add_option('--creator', default=None, help=_('Set the book creator'))
    parser.add_option('--producer', default=None, help=_('Set the book producer'))
    parser.add_option('--get-cover', action='store_true', default=False,
                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    parser.add_option('--bookid', action='store', type='string', default=None,
                      dest='book_id', help=_('Set book ID'))
    parser.add_option('--font-delta', action='store', type='int', default=0,
                      dest='font_delta', help=_('Set font delta'))
    return parser

class CHMError(Exception):
    pass

class CHMReader(CHMFile):
    def __init__(self, input, log):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False

        # location of '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.hhc_path = self.root + ".hhc"


    def _parse_toc(self, ul, basedir=os.getcwdu()):
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                href, frag = href.split('#')
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            #print "========>", name
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                child = self._parse_toc(li.ul)
                child.parent = toc
                toc.append(child)
        #print toc
        return toc


    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        # note this path refers to the internal CHM structure
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
        self._extracted = True

    def _reformat(self, data):
        try:
            soup = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # remove forward and back nav bars from the top/bottom of each page
        # cos they really fuck with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        t = soup('table')
        if t:
            if (t[0].previousSibling is None
                    or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
                    or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        for img in soup('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'): img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return soup.prettify()

    def Contents(self):
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            # note this path refers to the internal CHM structure
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        dir = os.path.dirname(path)
        if not os.path.isdir(dir):
            os.makedirs(dir)

    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)


class CHMInput(InputFormatPlugin):

    name = 'CHM Input'

@ -215,25 +19,22 @@ class CHMInput(InputFormatPlugin):
    description = 'Convert CHM files to OEB'
    file_types = set(['chm'])

    options = set([
        OptionRecommendation(name='dummy_option', recommended_value=False,
            help=_('dummy option until real options are determined.')),
    ])

    def _chmtohtml(self, output_dir, chm_path, no_images, log):
        from calibre.ebooks.chm.reader import CHMReader
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir)
        self._chm_reader = rdr
        return rdr.hhc_path


    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.metadata.chm import get_metadata_
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format

        log.debug('Processing CHM...')
        tdir = mkdtemp(prefix='chm2oeb_')
        from calibre.customize.ui import plugin_for_input_format
        with TemporaryDirectory('_chm2oeb') as tdir:
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)

@ -248,8 +49,9 @@ class CHMInput(InputFormatPlugin):
            log.debug('stream.name=%s' % stream.name)
            mainname = self._chmtohtml(tdir, chm_name, no_images, log)
            mainpath = os.path.join(tdir, mainname)
            #raw_input()

            metadata = get_metadata_(tdir)
            metadata = get_metadata_from_reader(self._chm_reader)

            odi = options.debug_pipeline
            options.debug_pipeline = None

@ -260,7 +62,6 @@ class CHMInput(InputFormatPlugin):
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            #log.debug('DEBUG: Not removing tempdir %s' % tdir)
        shutil.rmtree(tdir)
        return oeb

    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):

@ -369,6 +170,9 @@ class CHMInput(InputFormatPlugin):
        # check that node is a normal node (not a comment, DOCTYPE, etc.)
        # (normal nodes have string tags)
        if isinstance(node.tag, basestring):
            from calibre.ebooks.chm.reader import match_string

            chapter_path = None
            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
                for child in node:
                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):

157  src/calibre/ebooks/chm/metadata.py  Normal file
@ -0,0 +1,157 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import string_to_authors, MetaInformation
from calibre.utils.logging import default_log
from calibre.ptempfile import TemporaryFile

def _clean(s):
    return s.replace(u'\u00a0', u' ')

def _detag(tag):
    str = u""
    for elem in tag:
        if hasattr(elem, "contents"):
            str += _detag(elem)
        else:
            str += _clean(elem)
    return str


def _metadata_from_table(soup, searchfor):
    td = soup.find('td', text=re.compile(searchfor, flags=re.I))
    if td is None:
        return None
    td = td.parent
    # there appears to be multiple ways of structuring the metadata
    # on the home page. cue some nasty special-case hacks...
    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
        meta = _detag(td.findNextSibling('td'))
        return re.sub('^:', '', meta).strip()
    else:
        meta = _detag(td)
        return re.sub(r'^[^:]+:', '', meta).strip()

def _metadata_from_span(soup, searchfor):
    span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
    if span is None:
        return None
    # this metadata might need some cleaning up still :/
    return _detag(span.renderContents().strip())

def _get_authors(soup):
    aut = (_metadata_from_span(soup, r'author')
           or _metadata_from_table(soup, r'^\s*by\s*:?\s+'))
    ans = [_('Unknown')]
    if aut is not None:
        ans = string_to_authors(aut)
    return ans

def _get_publisher(soup):
    return (_metadata_from_span(soup, 'imprint')
            or _metadata_from_table(soup, 'publisher'))

def _get_isbn(soup):
    return (_metadata_from_span(soup, 'isbn')
            or _metadata_from_table(soup, 'isbn'))

def _get_comments(soup):
    date = (_metadata_from_span(soup, 'cwdate')
            or _metadata_from_table(soup, 'pub date'))
    pages = (_metadata_from_span(soup, 'pages')
             or _metadata_from_table(soup, 'pages'))
    try:
        # date span can have copyright symbols in it...
        date = date.replace(u'\u00a9', '').strip()
        # and pages often comes as '(\d+ pages)'
        pages = re.search(r'\d+', pages).group(0)
        return u'Published %s, %s pages.' % (date, pages)
    except:
        pass
    return None

def _get_cover(soup, rdr):
    ans = None
    try:
        ans = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
    except TypeError:
        # meeehh, no handy alt-tag goodness, try some hackery
        # the basic idea behind this is that in general, the cover image
        # has a height:width ratio of ~1.25, whereas most of the nav
        # buttons are decidedly less than that.
        # what we do in this is work out that ratio, take 1.25 off it and
        # save the absolute value when we sort by this value, the smallest
        # one is most likely to be the cover image, hopefully.
        r = {}
        for img in soup('img'):
            try:
                r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
            except KeyError:
                # interestingly, occasionally the only image without height
                # or width attrs is the cover...
                r[0] = img['src']
        l = r.keys()
        l.sort()
        ans = r[l[0]]
    # this link comes from the internal html, which is in a subdir
    if ans is not None:
        try:
            ans = rdr.GetFile(ans)
        except:
            ans = rdr.root + "/" + ans
            try:
                ans = rdr.GetFile(ans)
            except:
                ans = None
    if ans is not None:
        from PIL import Image
        from cStringIO import StringIO
        buf = StringIO()
        try:
            Image.open(StringIO(ans)).convert('RGB').save(buf, 'JPEG')
            ans = buf.getvalue()
        except:
            ans = None
    return ans


def get_metadata_from_reader(rdr):
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
                                        resolve_entities=True)[0])

    title = rdr.title
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi

def get_metadata(stream):
    with TemporaryFile('_chm_metadata.chm') as fname:
        with open(fname, 'wb') as f:
            f.write(stream.read())
        from calibre.ebooks.chm.reader import CHMReader
        rdr = CHMReader(fname, default_log)
        return get_metadata_from_reader(rdr)
212  src/calibre/ebooks/chm/reader.py  Normal file
@ -0,0 +1,212 @@
from __future__ import with_statement
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                ' and Alex Bramley <a.bramley at gmail.com>.'

import os, re
from mimetypes import guess_type as guess_mimetype

from BeautifulSoup import BeautifulSoup, NavigableString

from calibre.utils.chm.chm import CHMFile
from calibre.utils.chm.chmlib import (
    CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
    chm_enumerate,
)

from calibre.utils.config import OptionParser
from calibre.ebooks.metadata.toc import TOC


def match_string(s1, s2_already_lowered):
    if s1 is not None and s2_already_lowered is not None:
        if s1.lower()==s2_already_lowered:
            return True
    return False

def check_all_prev_empty(tag):
    if tag is None:
        return True
    if tag.__class__ == NavigableString and not check_empty(tag):
        return False
    return check_all_prev_empty(tag.previousSibling)

def check_empty(s, rex = re.compile(r'\S')):
    return rex.search(s) is None


def option_parser():
    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
    parser.add_option('--verbose', default=False, action='store_true', dest='verbose')
    parser.add_option("-t", "--title", action="store", type="string", \
                      dest="title", help=_("Set the book title"))
    parser.add_option('--title-sort', action='store', type='string', default=None,
                      dest='title_sort', help=_('Set sort key for the title'))
    parser.add_option("-a", "--author", action="store", type="string", \
                      dest="author", help=_("Set the author"))
    parser.add_option('--author-sort', action='store', type='string', default=None,
                      dest='author_sort', help=_('Set sort key for the author'))
    parser.add_option("-c", "--category", action="store", type="string", \
                      dest="category", help=_("The category this book belongs"
                      " to. E.g.: History"))
    parser.add_option("--thumbnail", action="store", type="string", \
                      dest="thumbnail", help=_("Path to a graphic that will be"
                      " set as this files' thumbnail"))
    parser.add_option("--comment", action="store", type="string", \
                      dest="freetext", help=_("Path to a txt file containing a comment."))
    parser.add_option("--get-thumbnail", action="store_true", \
                      dest="get_thumbnail", default=False, \
                      help=_("Extract thumbnail from LRF file"))
    parser.add_option('--publisher', default=None, help=_('Set the publisher'))
    parser.add_option('--classification', default=None, help=_('Set the book classification'))
    parser.add_option('--creator', default=None, help=_('Set the book creator'))
    parser.add_option('--producer', default=None, help=_('Set the book producer'))
    parser.add_option('--get-cover', action='store_true', default=False,
                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
    parser.add_option('--bookid', action='store', type='string', default=None,
                      dest='book_id', help=_('Set book ID'))
    parser.add_option('--font-delta', action='store', type='int', default=0,
                      dest='font_delta', help=_('Set font delta'))
    return parser

class CHMError(Exception):
    pass

class CHMReader(CHMFile):
    def __init__(self, input, log):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False

        # location of '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.hhc_path = self.root + ".hhc"


    def _parse_toc(self, ul, basedir=os.getcwdu()):
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                href, frag = href.split('#')
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            #print "========>", name
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                child = self._parse_toc(li.ul)
                child.parent = toc
                toc.append(child)
        #print toc
        return toc


    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        # note this path refers to the internal CHM structure
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == ('text/html'):
                    data = self._reformat(data)
                f.write(data)
        self._extracted = True
        files = os.listdir(output_dir)
        if self.hhc_path not in files:
            for f in files:
                if f.lower() == self.hhc_path.lower():
                    self.hhc_path = f
                    break

    def _reformat(self, data):
        try:
            soup = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # remove forward and back nav bars from the top/bottom of each page
        # cos they really fuck with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        t = soup('table')
        if t:
            if (t[0].previousSibling is None
                    or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
                    or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        for img in soup('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'): img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return soup.prettify()

    def Contents(self):
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            # note this path refers to the internal CHM structure
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        dir = os.path.dirname(path)
        if not os.path.isdir(dir):
            os.makedirs(dir)

    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)

@ -13,6 +13,7 @@ from calibre.customize.ui import input_profiles, output_profiles, \
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.date import parse_date
from calibre.utils.zipfile import ZipFile
from calibre import extract, walk

DEBUG_README=u'''

@ -726,6 +727,13 @@ OptionRecommendation(name='timestamp',
            else:
                os.makedirs(out_dir)
            self.dump_oeb(ret, out_dir)
            if self.input_fmt == 'recipe':
                zf = ZipFile(os.path.join(self.opts.debug_pipeline,
                    'periodical.downloaded_recipe'), 'w')
                zf.add_dir(out_dir)
                with self.input_plugin:
                    self.input_plugin.save_download(zf)
                zf.close()

            self.log.info('Input debug saved to:', out_dir)

@ -773,6 +781,7 @@ OptionRecommendation(name='timestamp',
        self.ui_reporter(0.01, _('Converting input to HTML...'))
        ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
        self.input_plugin.report_progress = ir
        with self.input_plugin:
            self.oeb = self.input_plugin(stream, self.opts,
                                         self.input_fmt, self.log,
                                         accelerators, tdir)

@ -780,7 +789,7 @@ OptionRecommendation(name='timestamp',
            self.dump_input(self.oeb, tdir)
            if self.abort_after_input_dump:
                return
        if self.input_fmt == 'recipe':
        if self.input_fmt in ('recipe', 'downloaded_recipe'):
            self.opts_to_mi(self.user_metadata)
        if not hasattr(self.oeb, 'manifest'):
            self.oeb = create_oebbook(self.log, self.oeb, self.opts,

@ -793,6 +802,8 @@ OptionRecommendation(name='timestamp',
            out_dir = os.path.join(self.opts.debug_pipeline, 'parsed')
            self.dump_oeb(self.oeb, out_dir)
            self.log('Parsed HTML written to:', out_dir)
        self.input_plugin.specialize(self.oeb, self.opts, self.log,
                                     self.output_fmt)

        pr(0., _('Running transforms on ebook...'))

@ -882,6 +893,7 @@ OptionRecommendation(name='timestamp',
        our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
        self.output_plugin.report_progress = our
        our(0., _('Creating')+' %s'%self.output_plugin.name)
        with self.output_plugin:
            self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
                                       self.opts, self.log)
        self.ui_reporter(1.)

@ -3,7 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, re, uuid
import os, uuid
from itertools import cycle

from lxml import etree

@ -19,8 +19,7 @@ class EPUBInput(InputFormatPlugin):

    recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)])

    @classmethod
    def decrypt_font(cls, key, path):
    def decrypt_font(self, key, path):
        raw = open(path, 'rb').read()
        crypt = raw[:1024]
        key = cycle(iter(key))

@ -29,13 +28,18 @@ class EPUBInput(InputFormatPlugin):
            f.write(decrypt)
            f.write(raw[1024:])

    @classmethod
    def process_encryption(cls, encfile, opf, log):
    def process_encryption(self, encfile, opf, log):
        key = None
        m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read())
        if m:
            key = m.group(1)
        for item in opf.identifier_iter():
            scheme = None
            for key in item.attrib.keys():
                if key.endswith('scheme'):
                    scheme = item.get(key)
            if (scheme and scheme.lower() == 'uuid') or \
                    (item.text and item.text.startswith('urn:uuid:')):
                key = str(item.text).rpartition(':')[-1]
        key = list(map(ord, uuid.UUID(key).bytes))

        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):

@ -46,7 +50,8 @@ class EPUBInput(InputFormatPlugin):
                    uri = cr.get('URI')
                    path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                    if os.path.exists(path):
                        cls.decrypt_font(key, path)
                        self._encrypted_font_uris.append(uri)
                        self.decrypt_font(key, path)
            return True
        except:
            import traceback

@ -115,14 +120,17 @@ class EPUBInput(InputFormatPlugin):
        if opf is None:
            raise ValueError('%s is not a valid EPUB file'%path)

        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, os.getcwdu())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        self.encrypted_fonts = self._encrypted_font_uris

        if len(parts) > 1 and parts[0]:
            delta = '/'.join(parts[:-1])+'/'
            for elem in opf.itermanifest():

@ -12,8 +12,9 @@ from urllib import unquote
from calibre.customize.conversion import OutputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from calibre import strftime, guess_type, prepare_string_for_xml
from calibre import strftime, guess_type, prepare_string_for_xml, CurrentDir
from calibre.customize.conversion import OptionRecommendation
from calibre.constants import filesystem_encoding

from lxml import etree

@ -157,11 +158,9 @@ class EPUBOutput(OutputFormatPlugin):

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.workaround_sony_quirks()
        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages()(oeb, opts)

        from calibre.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                max_flow_size=self.opts.flow_size*1024

@ -170,6 +169,21 @@ class EPUBOutput(OutputFormatPlugin):

        self.insert_cover()

        self.workaround_sony_quirks()

        from calibre.ebooks.oeb.base import OPF
        identifiers = oeb.metadata['identifier']
        uuid = None
        for x in identifiers:
            if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(x).startswith('urn:uuid:'):
                uuid = unicode(x).split(':')[-1]
                break
        if uuid is None:
            self.log.warn('No UUID identifier found')
            from uuid import uuid4
            uuid = str(uuid4())
            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

        with TemporaryDirectory('_epub_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')

@ -177,10 +191,16 @@ class EPUBOutput(OutputFormatPlugin):
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                    if x.endswith('.ncx')][0])
            encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

            from calibre.ebooks.epub import initialize_container
            epub = initialize_container(output_path, os.path.basename(opf))
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', encryption)
            if opts.extract_to is not None:
                if os.path.exists(opts.extract_to):
                    shutil.rmtree(opts.extract_to)

@ -189,6 +209,52 @@ class EPUBOutput(OutputFormatPlugin):
                self.log.info('EPUB extracted to', opts.extract_to)
            epub.close()

    def encrypt_fonts(self, uris, tdir, uuid):
        from binascii import unhexlify

        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid'%uuid)
        key = unhexlify((key + key)[:32])
        key = tuple(map(ord, key))
        paths = []
        with CurrentDir(tdir):
            paths = [os.path.join(*x.split('/')) for x in uris]
            uris = dict(zip(uris, paths))
            fonts = []
            for uri in list(uris.keys()):
                path = uris[uri]
                if isinstance(path, unicode):
                    path = path.encode(filesystem_encoding)
                if not os.path.exists(path):
                    uris.pop(uri)
                    continue
                self.log.debug('Encrypting font:', uri)
                with open(path, 'r+b') as f:
                    data = f.read(1024)
                    f.seek(0)
                    for i in range(1024):
                        f.write(chr(ord(data[i]) ^ key[i%16]))
                if not isinstance(uri, unicode):
                    uri = uri.decode('utf-8')
                fonts.append(u'''
        <enc:EncryptedData>
            <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
            <enc:CipherData>
                <enc:CipherReference URI="%s"/>
            </enc:CipherData>
        </enc:EncryptedData>
                '''%(uri.replace('"', '\\"')))
            if fonts:
                ans = '''<encryption
                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                    '''
                ans += (u'\n'.join(fonts)).encode('utf-8')
                ans += '\n</encryption>'
                return ans

    def default_cover(self):
        '''
        Create a generic cover for books that dont have a cover

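# --- Illustrative sketch (not part of the commit): the XOR scheme that
# decrypt_font() and encrypt_fonts() above share. The same operation both
# obfuscates and deobfuscates: only the first 1024 bytes of the font are
# XORed with a 16-byte key derived from the book's UUID; the rest is left
# untouched. The helper name is hypothetical.
from binascii import unhexlify
import re

def xor_font_data(raw, uuid):
    # build the 16-byte key exactly as encrypt_fonts() does
    key = re.sub(r'[^a-fA-F0-9]', '', uuid)
    key = map(ord, unhexlify((key + key)[:32]))
    head = ''.join(chr(ord(raw[i]) ^ key[i % 16]) for i in range(min(1024, len(raw))))
    return head + raw[1024:]
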
@ -20,7 +20,7 @@ from itertools import izip
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode
from calibre.customize.conversion import OptionRecommendation
from calibre.constants import islinux
from calibre.constants import islinux, isfreebsd
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename

@ -346,7 +346,7 @@ class HTMLInput(InputFormatPlugin):
        self.added_resources = {}
        self.log = log
        for path, href in htmlfile_map.items():
            if not islinux:
            if not (islinux or isfreebsd):
                path = path.lower()
            self.added_resources[path] = href
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer

@ -417,7 +417,7 @@ class HTMLInput(InputFormatPlugin):
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not islinux:
        if not (islinux or isfreebsd):
            link = link.lower()
        if link not in self.added_resources:
            bhref = os.path.basename(link)

@ -215,6 +215,28 @@ def merge_results(one, two):
        else:
            one[idx].smart_update(x)

class MetadataSources(object):

    def __init__(self, sources):
        self.sources = sources

    def __enter__(self):
        for s in self.sources:
            s.__enter__()
        return self

    def __exit__(self, *args):
        for s in self.sources:
            s.__exit__()

    def __call__(self, *args, **kwargs):
        for s in self.sources:
            s(*args, **kwargs)

    def join(self):
        for s in self.sources:
            s.join()

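# --- Illustrative sketch (not part of the commit): what the MetadataSources
# wrapper above buys the two call sites below - a single with-block instead
# of the old pairs of fetcher loops. The arguments are placeholders.
from calibre.customize.ui import metadata_sources

fetchers = list(metadata_sources(isbndb_key=None))
with MetadataSources(fetchers) as manager:
    manager('Some Title', 'Some Author', None, None, 0)
    manager.join()
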
def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
           verbose=0):
    assert not(title is None and author is None and publisher is None and \

@ -224,11 +246,10 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
    if isbn is not None:
        isbn = re.sub(r'[^a-zA-Z0-9]', '', isbn).upper()
    fetchers = list(metadata_sources(isbndb_key=isbndb_key))
    with MetadataSources(fetchers) as manager:
        manager(title, author, publisher, isbn, verbose)
        manager.join()

    for fetcher in fetchers:
        fetcher(title, author, publisher, isbn, verbose)
    for fetcher in fetchers:
        fetcher.join()
    results = list(fetchers[0].results)
    for fetcher in fetchers[1:]:
        merge_results(results, fetcher.results)

@ -243,10 +264,9 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
def get_social_metadata(mi, verbose=0):
    from calibre.customize.ui import metadata_sources
    fetchers = list(metadata_sources(metadata_type='social'))
    for fetcher in fetchers:
        fetcher(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
    for fetcher in fetchers:
        fetcher.join()
    with MetadataSources(fetchers) as manager:
        manager(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
        manager.join()
    ratings, tags, comments = [], set([]), set([])
    for fetcher in fetchers:
        if fetcher.results:

@ -70,6 +70,17 @@ def is_recipe(filename):
            filename.rpartition('.')[0].endswith('_recipe_out')

def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
    pos = 0
    if hasattr(stream, 'tell'):
        pos = stream.tell()
    try:
        return _get_metadata(stream, stream_type, use_libprs_metadata)
    finally:
        if hasattr(stream, 'seek'):
            stream.seek(pos)


def _get_metadata(stream, stream_type, use_libprs_metadata):
    if stream_type: stream_type = stream_type.lower()
    if stream_type in ('html', 'html', 'xhtml', 'xhtm', 'xml'):
        stream_type = 'html'

@ -97,9 +97,14 @@ class MetadataUpdater(object):

        self.nrecs, = unpack('>H', data[76:78])
        record0 = self.record0 = self.record(0)
        mobi_header_length, = unpack('>I', record0[0x14:0x18])
        if not mobi_header_length:
            raise MobiError("Non-standard file format. Try 'Convert E-Books' with MOBI as Input and Output formats.")

        self.encryption_type, = unpack('>H', record0[12:14])
        codepage, = unpack('>I', record0[28:32])
        self.codec = 'utf-8' if codepage == 65001 else 'cp1252'

        image_base, = unpack('>I', record0[108:112])
        flags, = self.flags, = unpack('>I', record0[128:132])
        have_exth = self.have_exth = (flags & 0x40) != 0

@ -306,9 +311,10 @@ class MetadataUpdater(object):
        return StreamSlicer(self.stream, start, stop)

    def update(self, mi):
        def pop_exth_record(exth_id):
            if exth_id in self.original_exth_records:
                self.original_exth_records.pop(exth_id)
        def update_exth_record(rec):
            recs.append(rec)
            if rec[0] in self.original_exth_records:
                self.original_exth_records.pop(rec[0])

        if self.type != "BOOKMOBI":
            raise MobiError("Setting metadata only supported for MOBI files of type 'BOOK'.\n"

@ -323,47 +329,36 @@ class MetadataUpdater(object):
            pas = False
        if mi.author_sort and pas:
            authors = mi.author_sort
            recs.append((100, authors.encode(self.codec, 'replace')))
            pop_exth_record(100)
            update_exth_record((100, authors.encode(self.codec, 'replace')))
        elif mi.authors:
            authors = '; '.join(mi.authors)
            recs.append((100, authors.encode(self.codec, 'replace')))
            pop_exth_record(100)
            update_exth_record((100, authors.encode(self.codec, 'replace')))
        if mi.publisher:
            recs.append((101, mi.publisher.encode(self.codec, 'replace')))
            pop_exth_record(101)
            update_exth_record((101, mi.publisher.encode(self.codec, 'replace')))
        if mi.comments:
            recs.append((103, mi.comments.encode(self.codec, 'replace')))
            pop_exth_record(103)
            update_exth_record((103, mi.comments.encode(self.codec, 'replace')))
        if mi.isbn:
            recs.append((104, mi.isbn.encode(self.codec, 'replace')))
            pop_exth_record(104)
            update_exth_record((104, mi.isbn.encode(self.codec, 'replace')))
        if mi.tags:
            subjects = '; '.join(mi.tags)
            recs.append((105, subjects.encode(self.codec, 'replace')))
            pop_exth_record(105)
            update_exth_record((105, subjects.encode(self.codec, 'replace')))
        if mi.pubdate:
            recs.append((106, str(mi.pubdate).encode(self.codec, 'replace')))
            pop_exth_record(106)
            update_exth_record((106, str(mi.pubdate).encode(self.codec, 'replace')))
        elif mi.timestamp:
            recs.append((106, str(mi.timestamp).encode(self.codec, 'replace')))
            pop_exth_record(106)
            update_exth_record((106, str(mi.timestamp).encode(self.codec, 'replace')))
        elif self.timestamp:
            recs.append((106, self.timestamp))
            pop_exth_record(106)
            update_exth_record((106, self.timestamp))
        else:
            recs.append((106, nowf().isoformat().encode(self.codec, 'replace')))
            pop_exth_record(106)
            update_exth_record((106, nowf().isoformat().encode(self.codec, 'replace')))
        if self.cover_record is not None:
            recs.append((201, pack('>I', self.cover_rindex)))
            recs.append((203, pack('>I', 0)))
            pop_exth_record(201)
            pop_exth_record(203)
            update_exth_record((201, pack('>I', self.cover_rindex)))
            update_exth_record((203, pack('>I', 0)))
        if self.thumbnail_record is not None:
            recs.append((202, pack('>I', self.thumbnail_rindex)))
            pop_exth_record(202)
            update_exth_record((202, pack('>I', self.thumbnail_rindex)))
        if 503 in self.original_exth_records:
            update_exth_record((503, mi.title.encode(self.codec, 'replace')))

        # Restore any original EXTH fields that weren't updated
        # Include remaining original EXTH fields
        for id in sorted(self.original_exth_records):
            recs.append((id, self.original_exth_records[id]))
        recs = sorted(recs, key=lambda x:(x[0],x[0]))

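# --- Illustrative sketch (not part of the commit): the net effect of
# update_exth_record() above on a toy EXTH table. Updated ids are queued in
# recs and dropped from the originals, so the later loop that re-appends
# self.original_exth_records cannot duplicate them.
recs = []
original_exth_records = {100: 'Old Author', 503: 'Old Title'}

def update_exth_record(rec):
    recs.append(rec)
    if rec[0] in original_exth_records:
        original_exth_records.pop(rec[0])

update_exth_record((100, 'New Author'))
# recs == [(100, 'New Author')]; original_exth_records == {503: 'Old Title'}
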
@ -779,6 +779,9 @@ class OPF(object):
            self.set_text(matches[0], unicode(val))
        return property(fget=fget, fset=fset)

    def identifier_iter(self):
        for item in self.identifier_path(self.metadata):
            yield item

    def guess_cover(self):
        '''

@@ -8,9 +8,10 @@ Read metadata from RAR archives
 '''
 
 import os
 from cStringIO import StringIO
-from calibre.ptempfile import PersistentTemporaryFile
 
+from calibre.ptempfile import PersistentTemporaryFile, TemporaryDirectory
 from calibre.libunrar import extract_member, names
+from calibre import CurrentDir
 
 def get_metadata(stream):
     from calibre.ebooks.metadata.archive import is_comic
@@ -32,8 +33,10 @@ def get_metadata(stream):
         stream_type = stream_type[1:]
         if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
                            'rb', 'imp', 'pdf', 'lrf'):
-            data = extract_member(path, match=None, name=f)[1]
-            stream = StringIO(data)
+            with TemporaryDirectory() as tdir:
+                with CurrentDir(tdir):
+                    stream = extract_member(path, match=None, name=f,
+                            as_file=True)[1]
             return get_metadata(stream, stream_type)
     raise ValueError('No ebook found in RAR archive')
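
Note: this hunk (and the ZIP one below) stops reading the archive member into a StringIO and instead extracts it as a real file inside a scratch working directory, since some downstream metadata readers need a named, seekable file. A rough standalone sketch of the two context managers the diff relies on (stand-ins for calibre's TemporaryDirectory and CurrentDir, names illustrative):

    import contextlib, os, shutil, tempfile

    @contextlib.contextmanager
    def current_dir(path):
        # Minimal stand-in for calibre.CurrentDir: enter `path`, always
        # restore the previous working directory on exit.
        prev = os.getcwd()
        os.chdir(path)
        try:
            yield path
        finally:
            os.chdir(prev)

    @contextlib.contextmanager
    def temporary_directory(suffix=''):
        # Minimal stand-in for calibre.ptempfile.TemporaryDirectory.
        tdir = tempfile.mkdtemp(suffix)
        try:
            yield tdir
        finally:
            shutil.rmtree(tdir, ignore_errors=True)

    # Usage mirroring the diff: extract the member into a scratch cwd so the
    # metadata reader receives a real, named file.
    with temporary_directory('_rar_meta') as tdir:
        with current_dir(tdir):
            pass  # extract_member(path, name=f, as_file=True) would run here
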
@@ -149,7 +149,8 @@ class TOC(list):
 
     def read_ncx_toc(self, toc):
         self.base_path = os.path.dirname(toc)
-        soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
+        raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
+        soup = NCXSoup(raw)
 
         def process_navpoint(np, dest):
             play_order = np.get('playOrder', None)
@@ -160,7 +161,7 @@ class TOC(list):
             if nl is not None:
                 text = u''
                 for txt in nl.findAll(re.compile('text')):
-                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
+                    text += u''.join([unicode(s) for s in txt.findAll(text=True)])
                 content = np.find(re.compile('content'))
                 if content is None or not content.has_key('src') or not txt:
                     return
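
Note: passing assume_utf8=True implements the 0.6.44 changelog entry (ticket 5039): when an NCX declares no encoding and detection is not fully confident, fall back to UTF-8. A rough sketch of that fallback policy, with the third-party chardet module standing in for calibre's detector (calibre's xml_to_unicode also honours declared encodings and BOMs first):

    def decode_ncx(raw, confidence_threshold=1.0):
        # Trust a detected encoding only when detection is certain;
        # otherwise assume UTF-8, which is the common case for NCX.
        import chardet
        guess = chardet.detect(raw)
        enc = guess['encoding'] or 'utf-8'
        if guess['confidence'] < confidence_threshold:
            enc = 'utf-8'
        return raw.decode(enc, 'replace')
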
@@ -43,6 +43,8 @@ def read_metadata_(task, tdir, notification=lambda x,y:x):
     import_map = {}
     for format in formats:
         nfp = run_plugins_on_import(format)
+        if nfp is None:
+            nfp = format
         nfp = os.path.abspath(nfp)
         if isinstance(nfp, unicode):
             nfp.encode(filesystem_encoding)
@@ -3,9 +3,10 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 
 import os
-from zipfile import ZipFile
-from cStringIO import StringIO
 
+from calibre.utils.zipfile import ZipFile
+from calibre.ptempfile import TemporaryDirectory
+from calibre import CurrentDir
 
 def get_metadata(stream):
     from calibre.ebooks.metadata.meta import get_metadata
@@ -23,8 +24,10 @@ def get_metadata(stream):
         stream_type = stream_type[1:]
         if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
                            'rb', 'imp', 'pdf', 'lrf'):
-            stream = StringIO(zf.read(f))
-            return get_metadata(stream, stream_type)
+            with TemporaryDirectory() as tdir:
+                with CurrentDir(tdir):
+                    path = zf.extract(f)
+                    return get_metadata(open(path, 'rb'), stream_type)
     raise ValueError('No ebook found in ZIP archive')
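
Note: same idea as the RAR change: zf.extract(f) inside a temporary working directory yields a real file whose name and seekability downstream readers can rely on, where a StringIO has neither. A stdlib-only sketch of the overall shape (get_metadata here is a hypothetical callback, not calibre's):

    import tempfile, zipfile

    def metadata_from_zip(path, get_metadata, exts=('epub', 'mobi', 'pdf')):
        # Pull the first recognised ebook out of a ZIP and hand the
        # reader an actual file on disk rather than an in-memory buffer.
        with zipfile.ZipFile(path) as zf:
            for name in zf.namelist():
                ext = name.rpartition('.')[-1].lower()
                if ext in exts:
                    tdir = tempfile.mkdtemp()
                    extracted = zf.extract(name, tdir)
                    with open(extracted, 'rb') as stream:
                        return get_metadata(stream, ext)
        raise ValueError('No ebook found in ZIP archive')
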
||||
|
@ -154,7 +154,7 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
MobiWriter, PALMDOC, UNCOMPRESSED
|
||||
from calibre.ebooks.mobi.mobiml import MobiMLizer
|
||||
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
|
||||
@ -163,8 +163,11 @@ class MOBIOutput(OutputFormatPlugin):
|
||||
tocadder(oeb, opts)
|
||||
mangler = CaseMangler()
|
||||
mangler(oeb, opts)
|
||||
try:
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb, opts)
|
||||
except Unavailable:
|
||||
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
|
||||
mobimlizer(oeb, opts)
|
||||
self.check_for_periodical()
|
||||
|
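
Note: together with the rasterize.py hunk further down, this turns a hard failure (no Qt available) into a skippable pipeline stage: the rasterizer raises a dedicated Unavailable exception and the MOBI pipeline logs a warning instead of aborting. The pattern in isolation (names mirror the diff; the transform itself is stubbed):

    import sys

    class Unavailable(Exception):
        """Raised by an optional pipeline stage that cannot run here."""

    class SVGRasterizer(object):
        def __init__(self):
            qt_ok = False  # stand-in for calibre's is_ok_to_use_qt() probe
            if not qt_ok:
                raise Unavailable('Not OK to use Qt')

        def __call__(self, book):
            pass  # the real transform would rasterize SVG images in `book`

    def run_pipeline(book, log):
        try:
            rasterizer = SVGRasterizer()
            rasterizer(book)
        except Unavailable:
            # Optional stage: degrade gracefully instead of failing the
            # whole conversion; SVG images simply pass through unconverted.
            log('SVG rasterizer unavailable, SVG will not be converted')

    run_pipeline(object(), lambda msg: sys.stdout.write(msg + '\n'))
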
@@ -4,12 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Read data from .mobi files
 '''
 
-import functools
-import os
-import re
-import struct
-import textwrap
-import cStringIO
+import functools, shutil, os, re, struct, textwrap, cStringIO, sys
 
 try:
     from PIL import Image as PILImage
@@ -619,6 +614,16 @@ class MobiReader(object):
                         * opf.cover.split('/'))):
                 opf.cover = None
 
+        cover = opf.cover
+        if cover is not None:
+            cover = cover.replace('/', os.sep)
+            if os.path.exists(cover):
+                ncover = 'images'+os.sep+'calibre_cover.jpg'
+                if os.path.exists(ncover):
+                    os.remove(ncover)
+                shutil.copyfile(cover, ncover)
+                opf.cover = ncover.replace(os.sep, '/')
+
         manifest = [(htmlfile, 'application/xhtml+xml'),
             (os.path.abspath('styles.css'), 'text/css')]
         bp = os.path.dirname(htmlfile)
@@ -796,15 +801,22 @@ class MobiReader(object):
 def get_metadata(stream):
     from calibre.utils.logging import Log
     log = Log()
+    mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
     try:
         mh = MetadataHeader(stream, log)
         if mh.title and mh.title != _('Unknown'):
             mi.title = mh.title
 
         if mh.exth is not None:
             if mh.exth.mi is not None:
                 mi = mh.exth.mi
         else:
+            size = sys.maxint
+            if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
+                pos = stream.tell()
+                stream.seek(0, 2)
+                size = stream.tell()
+                stream.seek(pos)
+            if size < 4*1024*1024:
                 with TemporaryDirectory('_mobi_meta_reader') as tdir:
                     with CurrentDir(tdir):
                         mr = MobiReader(stream, log)
@@ -818,10 +830,12 @@ def get_metadata(stream):
         else:
             data = mh.section_data(mh.first_image_index)
             buf = cStringIO.StringIO(data)
             try:
                 im = PILImage.open(buf)
-                obuf = cStringIO.StringIO()
-                im.convert('RGBA').save(obuf, format='JPEG')
-                mi.cover_data = ('jpg', obuf.getvalue())
             except:
-                log.exception()
+                log.exception('Failed to read MOBI cover')
+            else:
+                obuf = cStringIO.StringIO()
+                im.convert('RGB').save(obuf, format='JPEG')
+                mi.cover_data = ('jpg', obuf.getvalue())
     return mi
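
Note: the get_metadata change only attempts a full MobiReader parse (to recover an embedded cover) when the stream is under 4MB, measuring the size non-destructively with seek/tell. That measuring idiom on its own:

    import io

    def stream_size(stream, default=float('inf')):
        # Probe from the diff: jump to the end, record the offset,
        # then restore the caller's position.
        if not (hasattr(stream, 'seek') and hasattr(stream, 'tell')):
            return default
        pos = stream.tell()
        stream.seek(0, 2)      # 2 == os.SEEK_END
        size = stream.tell()
        stream.seek(pos)
        return size

    buf = io.BytesIO(b'x' * 1024)
    assert stream_size(buf) == 1024
    if stream_size(buf) < 4 * 1024 * 1024:
        pass  # cheap enough: safe to run the expensive full parse
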
@@ -152,13 +152,17 @@ class EbookIterator(object):
                 prints('Substituting font family: %s -> %s'%(bad, good))
             return match.group().replace(bad, '"%s"'%good)
 
+        from calibre.ebooks.chardet import force_encoding
         for csspath in css_files:
             with open(csspath, 'r+b') as f:
                 css = f.read()
-                css = font_family_pat.sub(prepend_embedded_font, css)
+                enc = force_encoding(css, False)
+                css = css.decode(enc, 'replace')
+                ncss = font_family_pat.sub(prepend_embedded_font, css)
+                if ncss != css:
                     f.seek(0)
                     f.truncate()
-                    f.write(css)
+                    f.write(ncss.encode(enc))
 
     def __enter__(self, processed=False):
         self.delete_on_exit = []
@@ -173,11 +177,12 @@ class EbookIterator(object):
         plumber.opts.no_process = True
 
+        plumber.input_plugin.for_viewer = True
         with plumber.input_plugin:
             self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
                     plumber.opts, plumber.input_fmt, self.log,
                     {}, self.base)
 
-        if processed or plumber.input_fmt.lower() in ('pdf', 'rb') and \
+        if processed or plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and \
                 not hasattr(self.pathtoopf, 'manifest'):
             self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
                     plumber.input_plugin)
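
Note: the viewer fix above decodes each CSS file with a detected encoding before running the font-family substitution (the 0.6.44 "non-ascii CSS" bug fix), and only rewrites the file when the substitution actually changed something, re-encoding with the same charset so non-ASCII stylesheets round-trip intact. A compact sketch of that read/transform/conditional-write-back cycle (detect_encoding is a hypothetical stand-in for calibre's force_encoding):

    def rewrite_css(path, transform, detect_encoding=lambda raw: 'utf-8'):
        # Open read/write in binary, decode, transform as text, and write
        # back only if the transform changed anything, in the same encoding.
        with open(path, 'r+b') as f:
            raw = f.read()
            enc = detect_encoding(raw)
            css = raw.decode(enc, 'replace')
            ncss = transform(css)
            if ncss != css:
                f.seek(0)
                f.truncate()
                f.write(ncss.encode(enc))
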
@@ -331,7 +331,10 @@ class OEBReader(object):
             id = child.get('id')
             klass = child.get('class', 'chapter')
 
-            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
+            try:
+                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
+            except:
+                po = self.oeb.toc.next_play_order()
 
             authorElement = xpath(child,
                     'descendant::calibre:meta[@name = "author"]')
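
Note: this mirrors the 0.6.44 changelog item about ignoring non-integral play orders when parsing NCX files: int() over attribute content can raise on junk, so the reader falls back to the next generated play order. The same defensive parse in isolation (next_play_order is a hypothetical callable):

    def parse_play_order(attr_value, next_play_order):
        # NCX files in the wild carry playOrder values like "c1" or empty
        # strings; treat anything non-integral as "assign the next one".
        try:
            return int(attr_value if attr_value is not None
                       else next_play_order())
        except (TypeError, ValueError):
            return next_play_order()

    counter = iter(range(1, 100))
    nxt = lambda: next(counter)
    assert parse_play_order('7', nxt) == 7
    assert parse_play_order('chapter-2', nxt) == 1
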
@@ -190,11 +190,11 @@ class Stylizer(object):
             selector = CSSSelector(ntext)
             matches = selector(tree)
 
-            if not matches and class_sel_pat.match(text):
+            if not matches and class_sel_pat.match(text) and text.lower() != text:
                 found = False
+                ltext = text.lower()
                 for x in tree.xpath('//*[@class]'):
-                    if text.lower().endswith('.'+x.get('class').lower()) and \
-                            text.lower() != text:
+                    if ltext.endswith('.'+x.get('class').lower()):
                         matches.append(x)
                         found = True
                 if found:
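
Note: the Stylizer tweak only takes the case-insensitive fallback path when the selector actually contains uppercase (text.lower() != text), and hoists the lowercasing out of the element loop. The fallback idea by itself, assuming lxml-style elements with a .get() accessor (class_sel_pat's real pattern lives in calibre; the one below is an assumption):

    import re

    class_sel_pat = re.compile(r'^\w+\.\w+$')  # assumed "tag.Class" shape

    def fallback_class_matches(text, elements):
        # Retry a failed class selector case-insensitively, but only when
        # it could matter, i.e. the selector has uppercase in it.
        if not class_sel_pat.match(text) or text.lower() == text:
            return []
        ltext = text.lower()  # lowercase once, outside the loop
        return [el for el in elements
                if ltext.endswith('.' + el.get('class', '').lower())]
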
@@ -27,11 +27,14 @@ from calibre.ebooks.oeb.stylizer import Stylizer
 IMAGE_TAGS = set([XHTML('img'), XHTML('object')])
 KEEP_ATTRS = set(['class', 'style', 'width', 'height', 'align'])
 
+class Unavailable(Exception):
+    pass
+
 class SVGRasterizer(object):
     def __init__(self):
         from calibre.gui2 import is_ok_to_use_qt
         if not is_ok_to_use_qt():
-            raise Exception('Not OK to use Qt')
+            raise Unavailable('Not OK to use Qt')
 
     @classmethod
     def config(cls, cfg):
@@ -29,7 +29,7 @@ class RescaleImages(object):
 
         page_width, page_height = self.opts.dest.width, self.opts.dest.height
-        if not self.opts.is_image_collection:
+        if not getattr(self.opts, 'is_image_collection', False):
             page_width -= (self.opts.margin_left + self.opts.margin_right) * self.opts.dest.dpi/72.
             page_height -= (self.opts.margin_top + self.opts.margin_bottom) * self.opts.dest.dpi/72.
         for item in self.oeb.manifest:
@@ -11,12 +11,14 @@ class PDBError(Exception):
 from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
 from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
 from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
+from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader
 
 FORMAT_READERS = {
     'PNPdPPrs': ereader_reader,
     'PNRdPPrs': ereader_reader,
     'zTXTGPlm': ztxt_reader,
     'TEXtREAd': palmdoc_reader,
+    '.pdfADBE': pdf_reader,
 }
 
 from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
@@ -34,8 +36,8 @@ IDENTITY_TO_NAME = {
     'PNRdPPrs': 'eReader',
     'zTXTGPlm': 'zTXT',
     'TEXtREAd': 'PalmDOC',
-
+    '.pdfADBE': 'Adobe Reader',
 
     'BVokBDIC': 'BDicty',
     'DB99DBOS': 'DB (Database program)',
     'vIMGView': 'FireViewer (ImageViewer)',
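
Note: the FORMAT_READERS keys are the 8-byte type/creator identity stored at offset 60 of a Palm database header; registering '.pdfADBE' wires the new PDF-in-PDB reader into that dispatch. A bare-bones version of the lookup (format_readers is any mapping shaped like the one in the diff):

    def pdb_identity(stream):
        # The PalmDB header stores 4 bytes of type and 4 bytes of creator
        # at offset 60; together they identify the payload format.
        stream.seek(60)
        return stream.read(8).decode('ascii', 'replace')

    def reader_for(stream, format_readers):
        ident = pdb_identity(stream)
        try:
            return format_readers[ident]
        except KeyError:
            raise ValueError('Unknown PDB identity: %r' % ident)
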
New files added in this commit:
src/calibre/ebooks/pdb/pdf/__init__.py (new file, 0 lines)
src/calibre/ebooks/pdb/pdf/reader.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+'''
+Read content from palmdoc pdb file.
+'''
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.ebooks.pdb.formatreader import FormatReader
+from calibre.ptempfile import TemporaryFile
+
+class Reader(FormatReader):
+
+    def __init__(self, header, stream, log, options):
+        self.header = header
+        self.stream = stream
+        self.log = log
+        self.options = options
+        setattr(self.options, 'new_pdf_engine', False)
+        setattr(self.options, 'no_images', False)
+        setattr(self.options, 'unwrap_factor', 0.5)
+
+    def extract_content(self, output_dir):
+        self.log.info('Extracting PDF...')
+
+        with TemporaryFile() as pdf_n:
+            pdf = open(pdf_n, 'rwb')
+            for x in xrange(self.header.section_count()):
+                pdf.write(self.header.section_data(x))
+
+            from calibre.customize.ui import plugin_for_input_format
+            pdf.seek(0)
+            return plugin_for_input_format('pdf').convert(pdf, self.options,
+                'pdf', self.log, [])
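
Note: the new reader reassembles the embedded PDF by concatenating every PDB record in order and then hands the rebuilt file to the regular PDF input plugin. The reassembly step on its own (header stands in for calibre's PDB header reader, assumed to expose section_count/section_data as in the diff):

    import tempfile

    def rebuild_payload(header):
        # Concatenate all PDB record sections into one temporary file and
        # return it rewound, ready for a downstream parser.
        out = tempfile.TemporaryFile()
        for i in range(header.section_count()):
            out.write(header.section_data(i))
        out.seek(0)
        return out
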
@@ -13,7 +13,7 @@ from functools import partial
 
 from calibre.ebooks import ConversionError, DRMError
 from calibre.ptempfile import PersistentTemporaryFile
-from calibre import isosx, iswindows, islinux
+from calibre import isosx, iswindows, islinux, isfreebsd
 from calibre import CurrentDir
 
 PDFTOHTML = 'pdftohtml'
@@ -23,7 +23,7 @@ if isosx and hasattr(sys, 'frameworks_dir'):
 if iswindows and hasattr(sys, 'frozen'):
     PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
     popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
-if islinux and getattr(sys, 'frozen_path', False):
+if (islinux or isfreebsd) and getattr(sys, 'frozen_path', False):
     PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
 
 def pdftohtml(output_dir, pdf_path, no_images):
@@ -72,14 +72,14 @@ class PML_HTMLizer(object):
         'ra': ('<span id="r%s"></span><a href="#%s">', '</a>'),
         'c': ('<div style="text-align: center; margin: auto;">', '</div>'),
         'r': ('<div style="text-align: right;">', '</div>'),
-        't': ('<div style="margin-left: 5%;">', '</div>'),
-        'T': ('<div style="margin-left: %s;">', '</div>'),
+        't': ('<div style="text-indent: 5%;">', '</div>'),
+        'T': ('<div style="text-indent: %s;">', '</div>'),
         'i': ('<span style="font-style: italic;">', '</span>'),
         'u': ('<span style="text-decoration: underline;">', '</span>'),
         'd': ('<span style="text-decoration: line-through;">', '</span>'),
         'b': ('<span style="font-weight: bold;">', '</span>'),
         'l': ('<span style="font-size: 150%;">', '</span>'),
-        'k': ('<span style="font-size: 75%;">', '</span>'),
+        'k': ('<span style="font-size: 75%; font-variant: small-caps;">', '</span>'),
         'FN': ('<br /><br style="page-break-after: always;" /><div id="fn-%s"><p>', '</p><<small><a href="#rfn-%s">return</a></small></div>'),
         'SB': ('<br /><br style="page-break-after: always;" /><div id="sb-%s"><p>', '</p><small><a href="#rsb-%s">return</a></small></div>'),
     }
@@ -154,6 +154,11 @@ class PML_HTMLizer(object):
         self.file_name = ''
 
     def prepare_pml(self, pml):
+        # Give Chapters the form \*='text'text\*. This is used for generating
+        # the TOC later.
+        pml = re.sub(r'(?<=\\x)(?P<text>.*?)(?=\\x)', lambda match: '="%s"%s' % (self.strip_pml(match.group('text')), match.group('text')), pml)
+        pml = re.sub(r'(?<=\\X[0-4])(?P<text>.*?)(?=\\X[0-4])', lambda match: '="%s"%s' % (self.strip_pml(match.group('text')), match.group('text')), pml)
+
         # Remove comments
         pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml)
@@ -163,7 +168,7 @@ class PML_HTMLizer(object):
         pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml)
         pml = re.sub(r'(?mus)^[ ]*$', '', pml)
 
-        # Footnotes and Sidebars
+        # Footnotes and Sidebars.
         pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
         pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
@@ -171,9 +176,7 @@ class PML_HTMLizer(object):
         # &. It will display as &amp;
         pml = pml.replace('&', '&amp;')
 
-        pml = re.sub(r'(?<=\\x)(?P<text>.*?)(?=\\x)', lambda match: '="%s"%s' % (self.strip_pml(match.group('text')), match.group('text')), pml)
-        pml = re.sub(r'(?<=\\X[0-4])(?P<text>.*?)(?=\\X[0-4])', lambda match: '="%s"%s' % (self.strip_pml(match.group('text')), match.group('text')), pml)
-
         # Replace \a and \U with either the unicode character or the entity.
         pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml)
         pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml)
@@ -536,6 +539,7 @@ class PML_HTMLizer(object):
             elif '%s%s' % (c, l) == 'Sd':
                 text = self.process_code('Sd', line, 'sb')
             elif c in 'xXC':
+                empty = False
                 # The PML was modified eariler so x and X put the text
                 # inside of ="" so we don't have do special processing
                 # for C.
@@ -578,9 +582,6 @@ class PML_HTMLizer(object):
             else:
                 if c != ' ':
                     empty = False
-                if self.state['k'][0]:
-                    text = c.upper()
-                else:
-                    text = c
+                text = c
                 parsed.append(text)
             c = line.read(1)
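
Note: prepare_pml now normalizes chapter codes at the start of the pipeline: lookbehind/lookahead regexes find the text between paired \x (or \X0 through \X4) markers and prefix it with ="plain text", so the TOC generator can later read the chapter label without re-stripping markup. The lookaround trick in miniature (strip_pml stubbed as identity; calibre's real version strips nested PML codes):

    import re

    def strip_pml(text):
        return text  # stand-in; calibre strips nested PML codes here

    pml = r'\xChapter \iOne\i\x'
    # Between a leading and trailing \x marker, inject ="label" before the text.
    out = re.sub(r'(?<=\\x)(?P<text>.*?)(?=\\x)',
                 lambda m: '="%s"%s' % (strip_pml(m.group('text')),
                                        m.group('text')),
                 pml)
    assert out == r'\x="Chapter \iOne\i"Chapter \iOne\i\x'
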
@@ -131,7 +131,7 @@ class PMLMLizer(object):
             if item.href in self.link_hrefs.keys():
                 toc.append('* \\q="#%s"%s\\q\n' % (self.link_hrefs[item.href], item.title))
             else:
-                self.oeb.warn('Ignoring toc item: %s not found in document.' % item)
+                self.oeb_book.warn('Ignoring toc item: %s not found in document.' % item)
         return ''.join(toc)
 
     def get_text(self):
@@ -131,9 +131,9 @@ class RtfTokenParser():
             if isString(self.tokens[i].name, "\\'"):
                 i = i + 1
                 if not isinstance(self.tokens[i], tokenData):
-                    raise BaseException('Error: token8bitChar without data.')
+                    raise Exception('Error: token8bitChar without data.')
                 if len(self.tokens[i].data) < 2:
-                    raise BaseException('Error: token8bitChar without data.')
+                    raise Exception('Error: token8bitChar without data.')
                 newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
                 if len(self.tokens[i].data) > 2:
                     newTokens.append(tokenData(self.tokens[i].data[2:]))
@@ -195,7 +195,7 @@ class RtfTokenParser():
                     i = i + 1
                     j = j + 1
                     continue
-                raise BaseException('Error: incorect utf replacement.')
+                raise Exception('Error: incorect utf replacement.')
 
             #calibre rtf2xml does not support utfreplace
             replace = []
@@ -248,7 +248,7 @@ class RtfTokenizer():
 
             if isChar(self.rtfData[i], '\\'):
                 if i + 1 >= len(self.rtfData):
-                    raise BaseException('Error: Control character found at the end of the document.')
+                    raise Exception('Error: Control character found at the end of the document.')
 
                 if lastDataStart > -1:
                     self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
@@ -269,7 +269,7 @@ class RtfTokenizer():
                     i = i + 1
 
                 if not consumed:
-                    raise BaseException('Error (at:%d): Control Word without end.'%(tokenStart))
+                    raise Exception('Error (at:%d): Control Word without end.'%(tokenStart))
 
                 #we have numeric argument before delimiter
                 if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
@@ -283,10 +283,10 @@ class RtfTokenizer():
                         l = l + 1
                         i = i + 1
                         if l > 10 :
-                            raise BaseException('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart])
+                            raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart])
 
                     if not consumed:
-                        raise BaseException('Error (at:%d): Control Word without numeric argument end.'%[tokenStart])
+                        raise Exception('Error (at:%d): Control Word without numeric argument end.'%[tokenStart])
 
                     separator = ''
                     if isChar(self.rtfData[i], ' '):
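
Note: the RTF tokenizer hunks swap BaseException for Exception throughout. Raising BaseException is an anti-pattern: it sits above SystemExit and KeyboardInterrupt in the hierarchy, so ordinary `except Exception` handlers upstream would never see these parse errors, while handlers broad enough to catch them would also swallow interrupts. A tiny illustration (parse is a toy, not the calibre tokenizer):

    def parse(raw):
        if '\\' not in raw:
            # Application errors should derive from Exception, not
            # BaseException, so ordinary handlers can catch them.
            raise Exception('Error: no control words found.')
        return raw.split('\\')

    try:
        parse('plain text')
    except Exception as e:  # would never fire if parse raised BaseException
        handled = str(e)

    assert handled == 'Error: no control words found.'
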
Some files were not shown because too many files have changed in this diff.