Pull from trunk

Kovid Goyal 2009-02-21 20:42:54 -08:00
commit 1d6a6586a9
20 changed files with 278 additions and 38 deletions

View File

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
-__version__ = '0.4.138'
+__version__ = '0.4.139'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
'''
Various run time constants.

View File

@@ -233,7 +233,7 @@ class RTFMetadataWriter(MetadataWriterPlugin):
class MOBIMetadataWriter(MetadataWriterPlugin):

    name = 'Set MOBI metadata'
-    file_types = set(['mobi', 'prc'])
+    file_types = set(['mobi', 'prc', 'azw'])
    description = _('Set metadata in %s files')%'MOBI'
    author = 'Marshall T. Vandegrift'
@@ -246,4 +246,4 @@ plugins = [HTML2ZIP]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
        x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
-        x.__name__.endswith('MetadataWriter')]
+        x.__name__.endswith('MetadataWriter')]

View File

@@ -33,6 +33,7 @@ class CYBOOKG3(USBMS):
    EBOOK_DIR_MAIN = "eBooks"
    EBOOK_DIR_CARD = "eBooks"
    THUMBNAIL_HEIGHT = 144
+    SUPPORTS_SUB_DIRS = True

    def upload_books(self, files, names, on_card=False, end_session=True,

View File

@@ -30,7 +30,7 @@ def write_t2b(t2bfile, coverdata=None):
    if coverdata != None:
        coverdata = StringIO.StringIO(coverdata)
        cover = Image.open(coverdata).convert("L")
-        cover.thumbnail((96, 144))
+        cover.thumbnail((96, 144), Image.ANTIALIAS)
        t2bcover = Image.new('L', (96, 144), 'white')

        x, y = cover.size
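
Context for the change above: in classic PIL, Image.thumbnail falls back to a fast, low-quality resample filter unless one is passed explicitly, so supplying Image.ANTIALIAS markedly improves the downscaled cover. A minimal standalone sketch (file names are placeholders):

    import Image  # classic PIL import; with Pillow it is: from PIL import Image

    cover = Image.open('cover.jpg').convert('L')  # hypothetical input cover
    cover.thumbnail((96, 144), Image.ANTIALIAS)   # resize in place, preserving aspect ratio
    cover.save('thumb.png')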

View File

@@ -205,9 +205,8 @@ class HTMLProcessor(Processor, Rationalizer):

    def save(self):
        for meta in list(self.root.xpath('//meta')):
            meta.getparent().remove(meta)
-        #for img in self.root.xpath('//img[@src]'):
-        #    self.convert_image(img)
-        Processor.save(self)
+        # Strip all comments since Adobe DE is petrified of them
+        Processor.save(self, strip_comments=True)

    def remove_first_image(self):
        images = self.root.xpath('//img')

View File

@@ -331,9 +331,8 @@ class PreProcessor(object):
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-        (re.compile(r'(?i)<{0,1}!\[(end){0,1}if[^>]*>'), lambda match: ''),
-        # Strip all comments since Adobe DE is petrified of them
-        (re.compile(r'<!--[^>]*>'), lambda match : ''),
+        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
+         lambda match: ''),
    ]

    # Fix pdftohtml markup
@@ -447,7 +446,7 @@ class Parser(PreProcessor, LoggingInterface):
    def save_path(self):
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])

-    def save(self):
+    def save(self, strip_comments=False):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.
@@ -458,7 +457,11 @@ class Parser(PreProcessor, LoggingInterface):
            svg.set('xmlns', 'http://www.w3.org/2000/svg')

        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
-        ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
+        ans = re.compile(r'<head>', re.IGNORECASE).sub(
+            '<head>\n\t<meta http-equiv="Content-Type" '
+            'content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
+        if strip_comments:
+            ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
        with open(self.save_path(), 'wb') as f:
            f.write(ans)
        return f.name
@@ -594,7 +597,7 @@ class Processor(Parser):
                mark = etree.Element('hr', style=page_break_before)
                elem.addprevious(mark)

-    def save(self):
+    def save(self, strip_comments=False):
        style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
        for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
            if sheet is not None:
@@ -608,7 +611,7 @@ class Processor(Parser):
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                open(path, 'wb').write(raw)
-        return Parser.save(self)
+        return Parser.save(self, strip_comments=strip_comments)

    def populate_toc(self, toc):
        '''
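
The comment-stripping regex added to Parser.save is non-greedy and DOTALL, unlike the `<!--[^>]*>` pattern removed from the preprocessor above, which broke on comments containing '>' or spanning lines. A quick illustration of the difference:

    import re

    html = '<body><!-- note\nwith a > inside -->text<!-- another --></body>'

    # Old pattern stops at the first '>', leaving comment debris behind:
    print re.sub(r'<!--[^>]*>', '', html)                     # -> '<body> inside -->text</body>'
    # New pattern removes each comment whole, across newlines:
    print re.compile(r'<!--.*?-->', re.DOTALL).sub('', html)  # -> '<body>text</body>'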

View File

@@ -30,6 +30,7 @@ preferred_source_formats = [
    'XHTML',
    'PRC',
    'AZW',
+    'FB2',
    'RTF',
    'PDF',
    'TXT',

View File

@@ -38,6 +38,7 @@ def extract_embedded_content(doc):
        open(fname, 'wb').write(data)

def to_html(fb2file, tdir):
    fb2file = os.path.abspath(fb2file)
+    cwd = os.getcwd()
    try:
        os.chdir(tdir)
@@ -52,7 +53,7 @@ def to_html(fb2file, tdir):
        result = transform(doc)
        open('index.html', 'wb').write(transform.tostring(result))
        try:
-            mi = get_metadata(open(fb2file, 'rb'))
+            mi = get_metadata(open(fb2file, 'rb'), 'fb2')
        except:
            mi = MetaInformation(None, None)
        if not mi.title:
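
The second argument tells calibre's get_metadata which reader to dispatch to instead of the default stream type, which is why FB2 metadata was previously missed. A usage sketch, assuming the import path calibre used at this time:

    from calibre.ebooks.metadata.meta import get_metadata  # assumed import path

    mi = get_metadata(open('book.fb2', 'rb'), 'fb2')  # 'fb2' selects the FB2 reader
    print mi.title, mi.authors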

Binary file not shown (new image, 295 B).

View File

@@ -114,10 +114,13 @@ sudo python -c "import urllib2; exec urllib2.urlopen('http://calibre.kovidgoyal.
wget -O- http://calibre.kovidgoyal.net/downloads/${app}-${version}.tar.gz | tar xvz
cd calibre*
python setup.py build &amp;&amp; sudo python setup.py install
+sudo calibre_postinstall
</pre>
Note that if your distribution does not have a
correctly compiled libunrar.so, ${app} will not
-support rar files.
+support rar files. The calibre_postinstall step
+is required for device detection and integration
+with your desktop environment.
</p>
</div>
</td>

View File

@@ -5,10 +5,11 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the logic for parsing feeds.
'''
-import time, logging, traceback, copy
+import time, logging, traceback, copy, re
from datetime import datetime

from calibre.web.feeds.feedparser import parse
+from calibre import entity_to_unicode
from lxml import html

class Article(object):
@@ -19,6 +20,11 @@ class Article(object):
        self.downloaded = False
        self.id = id
        self.title = title.strip() if title else title
+        try:
+            self.title = re.sub(r'&(\S+);',
+                    entity_to_unicode, self.title)
+        except:
+            pass
        self.url = url
        self.summary = summary
        if summary and not isinstance(summary, unicode):
@@ -37,6 +43,7 @@ class Article(object):
            self.date = published
        self.utctime = datetime(*self.date[:6])
        self.localtime = self.utctime + self.time_offset
+
    def __repr__(self):
        return \
@@ -91,7 +98,8 @@ class Feed(object):
                if len(self.articles) >= max_articles_per_feed:
                    break
                self.parse_article(item)
+
    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                          max_articles_per_feed=100):
        self.title = title if title else _('Unknown feed')
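
The new try/except runs entity_to_unicode as a re.sub callback so feed titles like 'AT&amp;T' render correctly. A rough standalone approximation of that helper using only the Python 2 standard library (calibre's real implementation may differ):

    import re
    from htmlentitydefs import name2codepoint

    def entity_to_unicode(match):
        # Resolve a single HTML entity captured by the surrounding regex
        ent = match.group(1)
        if ent.startswith('#'):
            num = ent[1:]
            if num.lower().startswith('x'):
                return unichr(int(num[1:], 16))
            return unichr(int(num))
        if ent in name2codepoint:
            return unichr(name2codepoint[ent])
        return '&%s;' % ent  # unknown entity: leave untouched

    print re.sub(r'&(\S+?);', entity_to_unicode, u'AT&amp;T &#169; 2009')
    # -> AT&T (copyright sign) 2009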

View File

@@ -30,7 +30,8 @@ recipe_modules = ['recipe_' + r for r in (
    'honoluluadvertiser', 'starbulletin', 'exiled', 'indy_star', 'dna',
    'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
    'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline',
-    'la_republica', 'physics_today',
+    'la_republica', 'physics_today', 'chicago_tribune', 'e_novine',
+    'al_jazeera', 'winsupersite',
)]

import re, imp, inspect, time, os

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
aljazeera.net
'''

from calibre.web.feeds.news import BasicNewsRecipe

class AlJazeera(BasicNewsRecipe):
    title = 'Al Jazeera in English'
    __author__ = 'Darko Miletic'
    description = 'News from Middle East'
    publisher = 'Al Jazeera'
    category = 'news, politics, middle east'
    simultaneous_downloads = 1
    delay = 4
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'iso-8859-1'
    remove_javascript = True
    use_embedded_content = False

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True'

    keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})]

    remove_tags = [
                     dict(name=['object','link'])
                    ,dict(name='td', attrs={'class':['MostActiveDescHeader','MostActiveDescBody']})
                  ]

    feeds = [(u'AL JAZEERA ENGLISH (AJE)', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989' )]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(face=True):
            del item['face']
        return soup

View File

@@ -0,0 +1,82 @@
from __future__ import with_statement

__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from urlparse import urlparse, urlunparse

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from threading import RLock

class ChicagoTribune(BasicNewsRecipe):

    title = 'Chicago Tribune'
    __author__ = 'Kovid Goyal'
    description = 'Politics, local and business news from Chicago'
    language = _('English')
    use_embedded_content = False
    articles_are_obfuscated = True
    remove_tags_before = dict(name='h1')
    obfuctation_lock = RLock()

    feeds = [
        ('Latest news', 'http://feeds.chicagotribune.com/chicagotribune/news/'),
        ('Local news', 'http://feeds.chicagotribune.com/chicagotribune/news/local/'),
        ('Nation/world', 'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'),
        ('Hot topics', 'http://feeds.chicagotribune.com/chicagotribune/hottopics/'),
        ('Most E-mailed stories', 'http://feeds.chicagotribune.com/chicagotribune/email/'),
        ('Opinion', 'http://feeds.chicagotribune.com/chicagotribune/opinion/'),
        ('Off Topic', 'http://feeds.chicagotribune.com/chicagotribune/offtopic/'),
        ('Politics', 'http://feeds.chicagotribune.com/chicagotribune/politics/'),
        ('Special Reports', 'http://feeds.chicagotribune.com/chicagotribune/special/'),
        ('Religion News', 'http://feeds.chicagotribune.com/chicagotribune/religion/'),
        ('Business news', 'http://feeds.chicagotribune.com/chicagotribune/business/'),
        ('Jobs and Careers', 'http://feeds.chicagotribune.com/chicagotribune/career/'),
        ('Local scene', 'http://feeds.chicagotribune.com/chicagohomes/localscene/'),
        ('Phil Rosenthal', 'http://feeds.chicagotribune.com/chicagotribune/rosenthal/'),
        ('Tech Buzz', 'http://feeds.chicagotribune.com/chicagotribune/techbuzz/'),
        ('Your Money', 'http://feeds.chicagotribune.com/chicagotribune/yourmoney/'),
        ('Jon Hilkevitch - Getting around', 'http://feeds.chicagotribune.com/chicagotribune/gettingaround/'),
        ('Jon Yates - What\'s your problem?', 'http://feeds.chicagotribune.com/chicagotribune/problem/'),
        ('Garisson Keillor', 'http://feeds.chicagotribune.com/chicagotribune/keillor/'),
        ('Marks Jarvis - On Money', 'http://feeds.chicagotribune.com/chicagotribune/marksjarvisonmoney/'),
        ('Sports', 'http://feeds.chicagotribune.com/chicagotribune/sports/'),
        ('Arts and Architecture', 'http://feeds.chicagotribune.com/chicagotribune/arts/'),
        ('Books', 'http://feeds.chicagotribune.com/chicagotribune/books/'),
        ('Magazine', 'http://feeds.chicagotribune.com/chicagotribune/magazine/'),
        ('Movies', 'http://feeds.chicagotribune.com/chicagotribune/movies/'),
        ('Music', 'http://feeds.chicagotribune.com/chicagotribune/movies/'),
        ('TV', 'http://feeds.chicagotribune.com/chicagotribune/tv/'),
        ('Hypertext', 'http://feeds.chicagotribune.com/chicagotribune/hypertext/'),
        ('iPhone Blog', 'http://feeds.feedburner.com/redeye/iphoneblog'),
        ('Julie\'s Health Club', 'http://feeds.chicagotribune.com/chicagotribune_julieshealthclub/'),
    ]

    temp_files = []

    def get_article_url(self, article):
        return article.get('feedburner_origlink', article.get('guid', article.get('link')))

    def get_obfuscated_article(self, url, logger):
        with self.obfuctation_lock:
            soup = self.index_to_soup(url)
            img = soup.find('img', alt='Print')
            if img is not None:
                a = img.parent.find('a', href=True)
                purl = urlparse(url)
                xurl = urlunparse(purl[:2] + (a['href'], '', '', ''))
                soup = self.index_to_soup(xurl)
                for img in soup.findAll('img', src=True):
                    if img['src'].startswith('/'):
                        img['src'] = urlunparse(purl[:2]+(img['src'], '', '', ''))
                html = unicode(soup)
            else:
                h1 = soup.find(id='page-title')
                body = soup.find(attrs={'class':re.compile('asset-content')})
                html = u'<html><head/><body>%s</body></html>'%(unicode(h1)+unicode(body))
            self.temp_files.append(PersistentTemporaryFile('_chicago_tribune.xhtml'))
            self.temp_files[-1].write(html.encode('utf-8'))
            self.temp_files[-1].close()
            return self.temp_files[-1].name
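
get_obfuscated_article rebuilds an absolute URL for the printer-friendly page from the article URL's scheme and host plus the relative href of the 'Print' link. A minimal sketch of that urlunparse pattern (URL and path are illustrative):

    from urlparse import urlparse, urlunparse  # Python 2; urllib.parse in Python 3

    url = 'http://www.chicagotribune.com/news/story-page.html'
    purl = urlparse(url)
    # Keep scheme and netloc, swap in a new path, drop params/query/fragment
    print urlunparse(purl[:2] + ('/news/story-page,print.html', '', '', ''))
    # -> http://www.chicagotribune.com/news/story-page,print.html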

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
e-novine.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class E_novine(BasicNewsRecipe):
    title = 'E-Novine'
    __author__ = 'Darko Miletic'
    description = 'News from Serbia'
    publisher = 'E-novine'
    category = 'news, politics, Balcans'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1250'
    cover_url = 'http://www.e-novine.com/slike/slike_3/r1/g2008/m03/y3165525326702598.jpg'
    remove_javascript = True
    use_embedded_content = False
    language = _('Serbian')
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]

    remove_tags = [dict(name=['object','link','embed','iframe'])]

    feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )]

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = 'sr-Latn-ME'
        soup.html['lang'] = 'sr-Latn-ME'
        mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
        if ftag:
            it = ftag.div
            it.extract()
            ftag.div.extract()
            ftag.insert(0,it)
        return soup
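
The div shuffle at the end of preprocess_html detaches the first child div, removes the div that then takes its place, and reinserts the original, leaving a single child. A toy BeautifulSoup 3 illustration (markup invented):

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as calibre recipes use

    soup = BeautifulSoup('<div id="css_47_0_2844H"><div>body</div><div>extra</div></div>')
    ftag = soup.find('div', attrs={'id': 'css_47_0_2844H'})
    it = ftag.div        # first inner div
    it.extract()         # detach it; the second div is now first
    ftag.div.extract()   # remove that one too
    ftag.insert(0, it)   # re-insert the original first div
    print soup           # -> <div id="css_47_0_2844H"><div>body</div></div>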

View File

@@ -19,7 +19,7 @@ class Infobae(BasicNewsRecipe):
    no_stylesheets = True
    use_embedded_content = False
    language = _('Spanish')
-    encoding = 'iso-8859-1'
+    encoding = 'cp1252'
    cover_url = 'http://www.infobae.com/imgs/header/header.gif'
    remove_javascript = True
@@ -28,9 +28,10 @@ class Infobae(BasicNewsRecipe):
                        , '--category' , category
                        , '--publisher', publisher
                        , '--ignore-tables'
+                        , '--ignore-colors'
                        ]

-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'

    remove_tags = [
                     dict(name=['embed','link','object'])

View File

@@ -6,8 +6,8 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
lasegunda.com
'''
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe

class LaSegunda(BasicNewsRecipe):
    title = 'La Segunda'
    __author__ = 'Darko Miletic'
@@ -21,14 +21,16 @@ class LaSegunda(BasicNewsRecipe):
    encoding = 'cp1252'
    cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
    remove_javascript = True
+    language = _('Spanish')

    html2lrf_options = [
-                          '--comment', description
+                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]

-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='table')]
@@ -52,10 +54,7 @@ class LaSegunda(BasicNewsRecipe):

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
        soup.head.insert(0,mtag)
-        for item in soup.findAll(name='table', width=True):
-            del item['width']
        for item in soup.findAll(style=True):
            del item['style']
        return soup
-
-    language = _('Spanish')

View File

@@ -7,11 +7,10 @@ pagina12.com.ar
'''

from calibre import strftime
-from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe

class Pagina12(BasicNewsRecipe):
-    title = u'Pagina/12'
+    title = 'Pagina/12'
    __author__ = 'Darko Miletic'
    description = 'Noticias de Argentina y el resto del mundo'
    publisher = 'La Pagina S.A.'
@@ -20,12 +19,14 @@ class Pagina12(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1252'
-    cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg')
+    cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/tapagn.jpg')
    remove_javascript = True
    use_embedded_content = False
+    language = _('Spanish')

    html2lrf_options = [
-                          '--comment', description
+                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]
@@ -50,5 +51,3 @@ class Pagina12(BasicNewsRecipe):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
-
-    language = _('Spanish')

View File

@@ -0,0 +1,28 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class Winsupersite(BasicNewsRecipe):
    title = u'Supersite for Windows'
    description = u'Paul Thurrott SuperSite for Windows'
    publisher = 'Paul Thurrott'
    __author__ = 'Hypernova'
    language = _('English')
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True
    html2lrf_options = ['--ignore-tables']
    html2epub_options = 'linearize_tables = True'
    remove_tags_before = dict(name='h1')

    preprocess_regexps = [
        (re.compile(r'<p>--Paul Thurrott.*</body>', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.winsupersite.com')
        return br

    feeds = [(u'Supersite for Windows', u'http://www.winsupersite.com/supersite.xml')]
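
The preprocess_regexps entry truncates everything from the author's signature through the end of the body, a common recipe idiom for chopping footers. A toy run (HTML invented):

    import re

    html = '<html><body><p>Article</p><p>--Paul Thurrott</p><p>footer</p></body>'
    pat = re.compile(r'<p>--Paul Thurrott.*</body>', re.DOTALL | re.IGNORECASE)
    print pat.sub('</body>', html)  # -> '<html><body><p>Article</p></body>'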

View File

@@ -284,7 +284,13 @@ class gui(OptionlessCommand):
                manifest = '<RCC>\n<qresource prefix="/">\n%s\n</qresource>\n</RCC>'%'\n'.join(files)
                with open('images.qrc', 'wb') as f:
                    f.write(manifest)
-                check_call(['pyrcc4', '-o', images, 'images.qrc'])
+                try:
+                    check_call(['pyrcc4', '-o', images, 'images.qrc'])
+                except:
+                    import traceback
+                    traceback.print_exc()
+                    raise Exception('You do not have pyrcc4 in your PATH. '
+                                    'Install the PyQt4 development tools.')
            else:
                print 'Images are up to date'
        finally:
@@ -670,7 +676,7 @@ class stage3(OptionlessCommand):

    def run(self):
        OptionlessCommand.run(self)
-        self.misc()
+        self.misc()

class stage2(OptionlessCommand):

    description = 'Stage 2 of the build process'

@@ -699,4 +705,4 @@ class upload(OptionlessCommand):
            ('stage1', None),
            ('stage2', None),
            ('stage3', None)
-    ]
+    ]
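
The try/except around check_call converts a confusing OSError into an actionable message. An alternative sketch that probes for the tool up front (a hypothetical refactor, not what this commit does):

    from distutils.spawn import find_executable
    from subprocess import check_call

    if find_executable('pyrcc4') is None:
        raise Exception('You do not have pyrcc4 in your PATH. '
                        'Install the PyQt4 development tools.')
    check_call(['pyrcc4', '-o', 'images.py', 'images.qrc'])  # paths illustrative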