Merge from trunk

Charles Haley 2010-06-02 08:26:15 +01:00
commit c80e12c5a2
14 changed files with 91 additions and 194 deletions

View File

@@ -1,189 +1,76 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-
-import re
-from calibre import strftime
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+import string
+
 from calibre.web.feeds.news import BasicNewsRecipe

 class Newsweek(BasicNewsRecipe):

     title = 'Newsweek'
-    __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'Kovid Goyal'
     description = 'Weekly news and current affairs in the US'
+    language = 'en'
+    encoding = 'utf-8'
     no_stylesheets = True
-    extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#383733;}
-        .deck{font-family:Georgia,sans-serif; color:#383733;}
-        .bylineDate{font-family:georgia ; color:#58544A; font-size:x-small;}
-        .authorInfo{font-family:arial,helvetica,sans-serif; color:#0066CC; font-size:x-small;}
-        .articleUpdated{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
-        .issueDate{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small; font-style:italic;}
-        h5{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
-        h6{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
-        .story{font-family:georgia,sans-serif ;color:black;}
-        .photoCredit{color:#999999; font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-        .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-        .fwArticle{font-family:Arial,Helvetica,sans-serif;font-size:x-small;font-weight:bold;}
-    '''
-    encoding = 'utf-8'
-    language = 'en'
+    BASE_URL = 'http://www.newsweek.com'
+    INDEX = BASE_URL+'/topics.html'

-    remove_tags = [
-        {'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
-            'inline-social-links-wrapper', 'email-article','ToolBox',
-            'inline-promo-link', 'sponsorship',
-            'inlineComponentRight',
-            'comments-and-social-links-wrapper', 'EmailArticleBlock']},
-        {'id' : ['footer', 'ticker-data', 'topTenVertical',
-            'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
-            'ToolBox', 'EmailMain']},
-        {'class': re.compile('related-cloud')},
-        dict(name='li', attrs={'id':['slug_bigbox']})
-        ]
-    keep_only_tags = [{'class':['article HorizontalHeader',
-        'articlecontent','photoBox', 'article columnist first']}, ]
-    recursions = 1
-    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
+    remove_tags = [dict(attrs={'data-dartad':True})]
+    remove_attributes = ['property']

-    def find_title(self, section):
-        d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
-            None:'Departments', 'culture':'Culture'}
-        ans = None
-        a = section.find('a', attrs={'name':True})
-        if a is not None:
-            ans = a['name']
-        return d.get(ans, ans)
+    def postprocess_html(self, soup, first):
+        for tag in soup.findAll(name=['article', 'header']):
+            tag.name = 'div'
+        return soup

-    def find_articles(self, section):
-        ans = []
-        for x in section.findAll('h5'):
-            title = ' '.join(x.findAll(text=True)).strip()
-            a = x.find('a')
-            if not a: continue
-            href = a['href']
-            ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
-        if not ans:
-            for x in section.findAll('div', attrs={'class':'hdlItem'}):
-                a = x.find('a', href=True)
-                if not a : continue
-                title = ' '.join(a.findAll(text=True)).strip()
-                href = a['href']
-                if 'http://xtra.newsweek.com' in href: continue
-                ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
-        #for x in ans:
-        #    x['url'] += '/output/print'
-        return ans
+    def newsweek_sections(self):
+        soup = self.index_to_soup(self.INDEX)
+        for a in soup.findAll('a', title='Primary tag', href=True):
+            yield (string.capitalize(self.tag_to_string(a)),
+                    self.BASE_URL+a['href'])

+    def newsweek_parse_section_page(self, soup):
+        for article in soup.findAll('article', about=True,
+                attrs={'class':'stream-item'}):
+            title = article.find(attrs={'property': 'dc:title'})
+            if title is None: continue
+            title = self.tag_to_string(title)
+            url = self.BASE_URL + article['about']
+            desc = ''
+            author = article.find({'property':'dc:creator'})
+            if author:
+                desc = u'by %s. '%self.tag_to_string(author)
+            p = article.find(attrs={'property':'dc:abstract'})
+            if p is not None:
+                for a in p.find('a'): a.extract()
+                desc += self.tag_to_string(p)
+            t = article.find('time', attrs={'property':'dc:created'})
+            date = ''
+            if t is not None:
+                date = u' [%s]'%self.tag_to_string(t)
+            self.log('\tFound article:', title, 'at', url)
+            self.log('\t\t', desc)
+            yield {'title':title, 'url':url, 'description':desc, 'date':date}

     def parse_index(self):
-        soup = self.get_current_issue()
-        if not soup:
-            raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
-        sections = soup.findAll('div', attrs={'class':'featurewell'})
-        titles = map(self.find_title, sections)
-        articles = map(self.find_articles, sections)
-        ans = list(zip(titles, articles))
-        def fcmp(x, y):
-            tx, ty = x[0], y[0]
-            if tx == "Features": return cmp(1, 2)
-            if ty == "Features": return cmp(2, 1)
-            return cmp(tx, ty)
-        return sorted(ans, cmp=fcmp)
-
-    def ensure_html(self, soup):
-        root = soup.find(name=True)
-        if root.name == 'html': return soup
-        nsoup = BeautifulSoup('<html><head></head><body/></html>')
-        nroot = nsoup.find(name='body')
-        for x in soup.contents:
-            if getattr(x, 'name', False):
-                x.extract()
-                nroot.insert(len(nroot), x)
-        return nsoup
-
-    def postprocess_html(self, soup, first_fetch):
-        if not first_fetch:
-            h1 = soup.find(id='headline')
-            if h1:
-                h1.extract()
-            div = soup.find(attrs={'class':'articleInfo'})
-            if div:
-                div.extract()
-        divs = list(soup.findAll('div', 'pagination'))
-        if not divs:
-            return self.ensure_html(soup)
-        for div in divs[1:]: div.extract()
-        all_a = divs[0].findAll('a', href=True)
-        divs[0]['style']="display:none"
-        if len(all_a) > 1:
-            all_a[-1].extract()
-        test = re.compile(self.match_regexps[0])
-        for a in soup.findAll('a', href=test):
-            if a not in all_a:
-                del a['href']
-        return self.ensure_html(soup)
-
-    def get_current_issue(self):
-        soup = self.index_to_soup('http://www.newsweek.com')
-        div = soup.find('div', attrs={'class':re.compile('more-from-mag')})
-        if div is None: return None
-        a = div.find('a')
-        if a is not None:
-            href = a['href'].split('#')[0]
-            return self.index_to_soup(href)
-
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup('http://www.newsweek.com')
-        link_item = soup.find('div',attrs={'class':'cover-image'})
-        if link_item and link_item.a and link_item.a.img:
-            cover_url = link_item.a.img['src']
-        return cover_url
-
-    def postprocess_book(self, oeb, opts, log) :
-
-        def extractByline(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
-            byline = soup.find(True,attrs={'class':'authorInfo'})
-            byline = self.tag_to_string(byline) if byline is not None else ''
-            issueDate = soup.find(True,attrs={'class':'issueDate'})
-            issueDate = self.tag_to_string(issueDate) if issueDate is not None else ''
-            issueDate = re.sub(',','', issueDate)
-            if byline > '' and issueDate > '' :
-                return byline + ' | ' + issueDate
-            else :
-                return byline + issueDate
-
-        def extractDescription(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
-            description = soup.find(True,attrs={'name':'description'})
-            if description is not None and description.has_key('content'):
-                description = description['content']
-                if description.startswith('Newsweek magazine online plus') :
-                    description = soup.find(True, attrs={'class':'story'})
-                    firstPara = soup.find('p')
-                    description = self.tag_to_string(firstPara)
-            else :
-                description = soup.find(True, attrs={'class':'story'})
-                firstPara = soup.find('p')
-                description = self.tag_to_string(firstPara)
-            return description
-
-        for section in oeb.toc :
-            for article in section :
-                if article.author is None :
-                    article.author = extractByline(article.href)
-                if article.description is None :
-                    article.description = extractDescription(article.href)
-        return
+        sections = []
+        for section, shref in self.newsweek_sections():
+            self.log('Processing section', section, shref)
+            articles = []
+            soups = [self.index_to_soup(shref)]
+            na = soups[0].find('a', rel='next')
+            if na:
+                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
+            for soup in soups:
+                articles.extend(self.newsweek_parse_section_page(soup))
+                if self.test and len(articles) > 1:
+                    break
+            if articles:
+                sections.append((section, articles))
+            if self.test and len(sections) > 1:
+                break
+        return sections
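
Note: the rewritten recipe relies on self.test to limit fetching while it is
being developed. A quick way to exercise it (a sketch, assuming the recipe
source is saved locally as Newsweek.recipe) is calibre's recipe test mode,
which downloads only a couple of articles per section:

    ebook-convert Newsweek.recipe .epub --test -vv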

View File

@@ -29,7 +29,7 @@ class Plugin(object):
     '''
     #: List of platforms this plugin works on
-    #: For example: ``['windows', 'osx', 'linux']
+    #: For example: ``['windows', 'osx', 'linux']``
     supported_platforms = []

     #: The name of this plugin. You must set it something other
@@ -214,10 +214,8 @@ class MetadataReaderPlugin(Plugin):
         Return metadata for the file represented by stream (a file like object
         that supports reading). Raise an exception when there is an error
         with the input data.
-
         :param type: The type of file. Guaranteed to be one of the entries
             in :attr:`file_types`.
-
         :return: A :class:`calibre.ebooks.metadata.MetaInformation` object
         '''
         return None
@@ -245,11 +243,9 @@ class MetadataWriterPlugin(Plugin):
         Set metadata for the file represented by stream (a file like object
         that supports reading). Raise an exception when there is an error
         with the input data.
-
         :param type: The type of file. Guaranteed to be one of the entries
             in :attr:`file_types`.
-
         :param mi: A :class:`calibre.ebooks.metadata.MetaInformation` object
         '''
         pass

View File

@@ -14,8 +14,14 @@ XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'

-convert_entities = functools.partial(entity_to_unicode, exceptions=['quot',
-        'apos', 'lt', 'gt', 'amp', '#60', '#62'])
+convert_entities = functools.partial(entity_to_unicode,
+        result_exceptions = {
+            u'<' : '&lt;',
+            u'>' : '&gt;',
+            u"'" : '&apos;',
+            u'"' : '&quot;',
+            u'&' : '&amp;',
+        })
 _span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

 LIGATURES = {
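
Note: with the old exceptions list, the listed entities were simply left
untouched; the new result_exceptions mapping instead resolves every entity to
unicode and then maps markup-significant characters back to safe named
entities. A minimal standalone sketch of that behaviour, using a toy resolver
for decimal entities only (not calibre's entity_to_unicode):

    import re

    result_exceptions = {u'<': '&lt;', u'>': '&gt;', u"'": '&apos;',
                         u'"': '&quot;', u'&': '&amp;'}

    def resolve(match):
        # resolve the numeric entity, then re-escape markup characters
        ch = unichr(int(match.group(1)))
        return result_exceptions.get(ch, ch)

    print(re.sub(r'&#(\d+);', resolve, u'a &#60;b&#62; &#38; c'))
    # prints: a &lt;b&gt; &amp; c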

View File

@@ -3,7 +3,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

 import struct, array, zlib, cStringIO, collections, re
 from calibre.ebooks.lrf import LRFParseError, PRS500_PROFILE
-from calibre import entity_to_unicode
+from calibre import entity_to_unicode, prepare_string_for_xml
 from calibre.ebooks.lrf.tags import Tag

 ruby_tags = {
@@ -870,7 +870,7 @@ class Text(LRFStream):
         open_containers = collections.deque()
         for c in self.content:
             if isinstance(c, basestring):
-                s += c
+                s += prepare_string_for_xml(c)
             elif c is None:
                 if open_containers:
                     p = open_containers.pop()
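
Note: prepare_string_for_xml escapes characters that would otherwise produce
invalid markup when raw LRF text is serialized to XML. A one-line sketch of
the intended behaviour (assuming calibre is importable):

    from calibre import prepare_string_for_xml
    print(prepare_string_for_xml('AT&T <rocks>'))
    # expected: AT&amp;T &lt;rocks&gt;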

View File

@@ -787,7 +787,6 @@ class Manifest(object):
             data = self.oeb.decode(data)
             data = self.oeb.html_preprocessor(data)
             # Remove DOCTYPE declaration as it messes up parsing
-            # In particular, it causes tostring to insert xmlns
             # declarations, which messes up the coercing logic

View File

@@ -136,6 +136,8 @@ class CoverManager(object):
             href = g['cover'].href
         else:
             href = self.default_cover()
+        if href is None:
+            return
         width, height = self.inspect_cover(href)
         if width is None or height is None:
             self.log.warning('Failed to read cover dimensions')

View File

@@ -14,6 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder
 from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
 from calibre.gui2 import error_dialog, choose_files
 from calibre.ebooks.oeb.iterator import EbookIterator
+from calibre.ebooks.conversion.preprocess import convert_entities
 from calibre.gui2.dialogs.choose_format import ChooseFormatDialog

 class RegexBuilder(QDialog, Ui_RegexBuilder):
@@ -87,8 +88,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
         self.iterator = EbookIterator(pathtoebook)
         self.iterator.__enter__(only_input_plugin=True)
         text = [u'']
+        ent_pat = re.compile(r'&(\S+?);')
         for path in self.iterator.spine:
             html = open(path, 'rb').read().decode('utf-8', 'replace')
+            html = ent_pat.sub(convert_entities, html)
             text.append(html)
         self.preview.setPlainText('\n---\n'.join(text))

View File

@@ -1123,12 +1123,12 @@ class DeviceGUI(object):
                 if cache:
                     if id in cache['db_ids']:
                         loc[i] = True
-                        break
+                        continue
                     if mi.authors and \
                             re.sub('(?u)\W|[_]', '', authors_to_string(mi.authors).lower()) \
                             in cache['authors']:
                         loc[i] = True
-                        break
+                        continue
         return loc

     def set_books_in_library(self, booklists, reset=False):
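
Note: the enclosing loop (not shown in the hunk) appears to walk every book
being matched; with break, the first hit ended the scan and later books were
never marked, while continue records the hit and moves on. A generic
illustration of the difference, not calibre's code:

    books = ['a', 'b', 'a']
    loc = [False] * len(books)
    for i, title in enumerate(books):
        if title == 'a':
            loc[i] = True
            continue  # with break here, loc would end up [True, False, False]
    print(loc)
    # prints: [True, False, True]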

View File

@@ -17,7 +17,7 @@ from calibre.utils.config import tweaks
 from calibre.utils.date import parse_date, now, UNDEFINED_DATE
 from calibre.utils.search_query_parser import SearchQueryParser
 from calibre.utils.pyparsing import ParseException
-# from calibre.library.field_metadata import FieldMetadata
+from calibre.ebooks.metadata import title_sort

 class CoverCache(QThread):
@@ -564,7 +564,8 @@ class ResultCache(SearchQueryParser):
     def seriescmp(self, x, y):
         sidx = self.FIELD_MAP['series']
         try:
-            ans = cmp(self._data[x][sidx].lower(), self._data[y][sidx].lower())
+            ans = cmp(title_sort(self._data[x][sidx].lower()),
+                      title_sort(self._data[y][sidx].lower()))
         except AttributeError: # Some entries may be None
             ans = cmp(self._data[x][sidx], self._data[y][sidx])
         if ans != 0: return ans
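
Note: this change, and the two like it below, route series names through
title_sort() from calibre.ebooks.metadata so that series sort under their
significant word rather than a leading article. A sketch of the expected
effect (exact output depends on the title sorting tweak):

    from calibre.ebooks.metadata import title_sort

    for s in ['the dark tower', 'a song of ice and fire', 'discworld']:
        print(title_sort(s))
    # expected: dark tower, the / song of ice and fire, a / discworld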

View File

@@ -725,6 +725,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
             categories[category] = [Tag(formatter(r[1]), count=r[2], id=r[0],
                                         icon=icon, tooltip = tooltip)
                                     for r in data if item_not_zero_func(r)]
+            if category == 'series':
+                categories[category].sort(cmp=lambda x,y:cmp(title_sort(x.name),
+                                                             title_sort(y.name)))

         # We delayed computing the standard formats category because it does not
         # use a view, but is computed dynamically

View File

@@ -16,7 +16,7 @@ except ImportError:

 from calibre import fit_image, guess_type
 from calibre.utils.date import fromtimestamp
+from calibre.ebooks.metadata import title_sort

 class ContentServer(object):
@@ -67,7 +67,7 @@ class ContentServer(object):
     def seriescmp(self, x, y):
         si = self.db.FIELD_MAP['series']
         try:
-            ans = cmp(x[si].lower(), y[si].lower())
+            ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
         except AttributeError: # Some entries may be None
             ans = cmp(x[si], y[si])
         if ans != 0: return ans

View File

@@ -453,7 +453,7 @@ as HTML and then convert the resulting HTML file with |app|. When saving as HTML
 There is a Word macro package that can automate the conversion of Word documents using |app|. It also makes
 generating the Table of Contents much simpler. It is called BookCreator and is available for free
-`here <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
+at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.

 Convert TXT documents
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -493,7 +493,7 @@ TXT input supports a number of options to differentiate how paragraphs are detected.
 allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
 lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
 expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
-You can learn more about the markdown syntax `here <http://daringfireball.net/projects/markdown/syntax>`_.
+You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.

 Convert PDF documents
@@ -540,7 +540,7 @@ EPUB advanced formatting demo
 Various advanced formatting for EPUB files is demonstrated in this `demo file <http://calibre-ebook.com/downloads/demos/demo.epub>`_.
 The file was created from hand coded HTML using calibre and is meant to be used as a template for your own EPUB creation efforts.
-The source HTML it was created from is available `here <http://calibre-ebook.com/downloads/demos/demo.zip>`_. The settings used to create the
+The source HTML it was created from is available `demo.zip <http://calibre-ebook.com/downloads/demos/demo.zip>`_. The settings used to create the
 EPUB from the ZIP file are::

     ebook-convert demo.zip .epub -vv --authors "Kovid Goyal" --language en --level1-toc '//*[@class="title"]' --disable-font-rescaling --page-breaks-before / --no-default-epub-cover

View File

@@ -133,7 +133,7 @@ Can I use the collections feature of the SONY reader?
 turned into a collection on the reader. Note that the PRS-500 does not support collections for books stored on the SD card. The PRS-505 does.

 How do I use |app| with my iPad/iPhone/iTouch?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 You can access your calibre library on a iPad/iPhone/iTouch over the air using the calibre content server.

View File

@@ -146,7 +146,7 @@ class BasicNewsRecipe(Recipe):
     #: If True empty feeds are removed from the output.
     #: This option has no effect if parse_index is overriden in
     #: the sub class. It is meant only for recipes that return a list
-    #: of feeds using :member:`feeds` or :method:`get_feeds`.
+    #: of feeds using `feeds` or :method:`get_feeds`.
     remove_empty_feeds = False

     #: List of regular expressions that determines which links to follow
@@ -256,7 +256,7 @@ class BasicNewsRecipe(Recipe):
     #: The CSS that is used to styles the templates, i.e., the navigation bars and
     #: the Tables of Contents. Rather than overriding this variable, you should
-    #: use :member:`extra_css` in your recipe to customize look and feel.
+    #: use `extra_css` in your recipe to customize look and feel.
     template_css = u'''
         .article_date {
             color: gray; font-family: monospace;
@@ -506,7 +506,7 @@ class BasicNewsRecipe(Recipe):
     def get_obfuscated_article(self, url):
         '''
-        If you set :member:`articles_are_obfuscated` this method is called with
+        If you set `articles_are_obfuscated` this method is called with
         every article URL. It should return the path to a file on the filesystem
         that contains the article HTML. That file is processed by the recursive
         HTML fetching engine, so it can contain links to pages/images on the web.