Sync to trunk.

This commit is contained in:
John Schember 2010-03-24 18:51:52 -04:00
commit 16c9b7dc1e
7 changed files with 131 additions and 53 deletions

View File

@ -0,0 +1,63 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
'''
spectrum.ieee.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords
from urlparse import urljoin
class IEEESpectrum(BasicNewsRecipe):
    """Calibre news recipe that builds an ebook issue of IEEE Spectrum
    magazine by scraping the magazine index page at spectrum.ieee.org.

    NOTE(review): indentation reconstructed from a whitespace-stripped
    dump; all non-whitespace tokens are unchanged from the original.
    """
    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    # Articles older than this many days are skipped by the framework.
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'

    # Landing page holding the current issue's table of contents.
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    # Drop <script> and <object> tags entirely from article pages.
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    # Keep only the section label, titles and the article body container.
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]

    def parse_index(self):
        """Scrape the magazine index page and return the calibre feed
        structure: a list of (section_title, [article_dict, ...]) tuples.
        """
        soup = self.index_to_soup(self.index)
        # The cover image tag carries image='cover.gif'; its src is a
        # site-relative path, so prefix the host to get an absolute URL.
        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']
        content = soup.find(id='gnrlContent')
        # The 'style4' element holds the issue title; its first two words
        # are taken as the issue date (e.g. "March 2010") — assumed
        # format of the live page at the time, verify against the site.
        title = content.find(attrs={'class':'style4'}).string.strip()
        date = ' '.join(title.split()[0:2])
        # Shown in the ebook's title as "IEEE Spectrum [<date>]".
        self.timefmt = ' [' + date + ']'
        contents = []
        # Walk the TOC in document order; the three classes interleave as
        # section heading ('style2'), article title ('lstngTitle') and
        # article blurb ('lstngBody'), so each title/blurb attaches to the
        # most recently seen section / article respectively.
        for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
            if tag['class'] == 'style2':
                # New section: title-cased heading with an empty article list.
                contents.append((capwords(tag.renderContents().strip()), []))
            elif tag['class'] == 'lstngTitle':
                # Article link is the nearest preceding <a>; '/0' requests
                # the single-page (non-paginated) view of the article.
                url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
                contents[-1][1].append({'title': tag.renderContents().strip(),
                    'url': url,
                    'date': date,
                    'description': '',
                    'content': ''
                    })
            elif tag['class'] == 'lstngBody':
                # Blurb belongs to the article appended just above.
                contents[-1][1][-1]['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        """Absolutize any relative hyperlink before conversion.

        Assumes every <a> in the kept content has an href attribute —
        a missing href would raise KeyError here (TODO confirm).
        """
        for a in soup.findAll('a'):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup

View File

@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe):
# List of sections to exclude
# To add a section, copy the section name from the allSectionKeywords list above
# For example, to exclude 'Dining' and 'Weddings':
# excludeSectionKeywords = ['Dining','Weddings']
#excludeSectionKeywords = ['Dining','Weddings']
excludeSectionKeywords = []
# List of sections to include (test and debug only)
@ -56,20 +56,25 @@ class NYTimes(BasicNewsRecipe):
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':[
'articleFooter',
'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'icon enlargeThis',
'leftNavTabs',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
'subNavigation tabContent active clearfix',
]}),
dict(id=[
'adxLeaderboard',
@ -222,11 +227,11 @@ class NYTimes(BasicNewsRecipe):
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key):
self.log("Skipping section %s" % key)
continue
if self.excludeSectionKeywords:
excluded = re.compile('|'.join(self.excludeSectionKeywords))
if excluded.search(key):
self.log("Skipping section %s" % key)
continue
articles[key] = []
ans.append(key)

View File

@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe):
use_embedded_content = False
simultaneous_downloads = 1
encoding = 'ISO-8859-1'
lang = 'en-UK'
remove_javascript = True
language = 'en'
language = 'en_GB'
recursions = 9
match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']

View File

@ -150,7 +150,8 @@ class PRS505(CLI, Device):
for location in locations:
info = metadata.next()
path = location[0]
blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
oncard = location[3]
blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0
if self._main_prefix and path.startswith(self._main_prefix):
name = path.replace(self._main_prefix, '')
@ -166,7 +167,11 @@ class PRS505(CLI, Device):
opts = self.settings()
collections = opts.extra_customization.split(',') if opts.extra_customization else []
booklists[blist].add_book(info, name, collections, *location[1:-1])
booklist = booklists[blist]
if not hasattr(booklist, 'add_book'):
raise ValueError(('Incorrect upload location %s. Did you choose the'
' correct card A or B, to send books to?')%oncard)
booklist.add_book(info, name, collections, *location[1:-1])
fix_ids(*booklists)
def delete_books(self, paths, end_session=True):

View File

@ -230,14 +230,25 @@ class HTMLPreProcessor(object):
end_rules = []
if getattr(self.extra_opts, 'remove_header', None):
end_rules.append(
(re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
)
try:
end_rules.append(
(re.compile(self.extra_opts.header_regex), lambda match : '')
)
except:
import traceback
print 'Failed to parse remove_header regexp'
traceback.print_exc()
if getattr(self.extra_opts, 'remove_footer', None):
end_rules.append(
(re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
)
try:
end_rules.append(
(re.compile(self.extra_opts.footer_regex), lambda match : '')
)
except:
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
if length:

View File

@ -267,7 +267,7 @@ class MetadataUpdater(object):
offset += md_len
self.metadata[tag] = metadata
def regenerate_headers(self, len_updated_metadata):
def regenerate_headers(self, updated_md_len):
headers = {}
for tag in self.topaz_headers:
@ -276,22 +276,16 @@ class MetadataUpdater(object):
else:
headers[tag] = None
# Sort headers based on initial offset
sh = sorted(headers,key=lambda x:(headers[x],headers[x]))
# Metadata goes last
sh.remove('metadata')
sh.append('metadata')
original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp']
original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
delta = updated_md_len - original_md_len
# Copy the first 5 bytes of the file: sig + num_recs
ths = StringIO.StringIO()
ths.write(self.data[:5])
# Rewrite the offsets for hdr_offsets > metadata original location
for tag in sh[:-1]:
# Rewrite the offsets for hdr_offsets > metadata offset
for tag in headers.keys():
ths.write('c')
ths.write(self.encode_vwi(len(tag)))
ths.write(tag)
@ -300,32 +294,18 @@ class MetadataUpdater(object):
for block in self.topaz_headers[tag]['blocks']:
b = self.topaz_headers[tag]['blocks'][block]
if b['offset'] < original_md_offset:
if b['offset'] <= original_md_offset:
ths.write(self.encode_vwi(b['offset']))
else:
ths.write(self.encode_vwi(b['offset'] - original_md_len))
ths.write(self.encode_vwi(b['offset'] + delta))
ths.write(self.encode_vwi(b['len_uncomp']))
if tag == 'metadata':
ths.write(self.encode_vwi(updated_md_len))
else:
ths.write(self.encode_vwi(b['len_uncomp']))
ths.write(self.encode_vwi(b['len_comp']))
else:
ths.write(self.encode_vwi(0))
# Adjust metadata offset to end
new_md_offset = (len(self.data) - self.base - original_md_len)
new_md_len = len_updated_metadata - 1 - len('metadata') - 1
# Write the metadata header
ths.write('c')
ths.write(self.encode_vwi(len('metadata')))
ths.write('metadata')
ths.write(self.encode_vwi(1))
ths.write(self.encode_vwi(new_md_offset))
ths.write(self.encode_vwi(new_md_len))
ths.write(self.encode_vwi(0))
self.sorted_headers = sh
self.original_md_start = original_md_offset + self.base
self.original_md_len = original_md_len
return ths.getvalue().encode('iso-8859-1')
@ -364,8 +344,8 @@ class MetadataUpdater(object):
self.stream.write(head)
self.stream.write('d')
self.stream.write(chunk1)
self.stream.write(chunk2)
self.stream.write(updated_metadata)
self.stream.write(chunk2)
def get_metadata(stream):
mu = MetadataUpdater(stream)
@ -377,6 +357,21 @@ def set_metadata(stream, mi):
return
if __name__ == '__main__':
#print get_metadata(open(sys.argv[1], 'rb'))
mi = MetaInformation(title="My New Title", authors=['Smith, John'])
set_metadata(open(sys.argv[1], 'rb'), mi)
if False:
# Test get_metadata()
print get_metadata(open(sys.argv[1], 'rb'))
else:
# Test set_metadata()
import cStringIO
data = open(sys.argv[1], 'rb')
stream = cStringIO.StringIO()
stream.write(data.read())
mi = MetaInformation(title="A Marvelously Long Title", authors=['Riker, Gregory; Riker, Charles'])
set_metadata(stream, mi)
# Write the result
tokens = sys.argv[1].rpartition('.')
updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb')
updated_data.write(stream.getvalue())
updated_data.close()

View File

@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe):
self.log.error(_('Could not download cover: %s')%str(err))
self.log.debug(traceback.format_exc())
if cu is not None:
ext = cu.rpartition('.')[-1]
ext = cu.split('/')[-1].rpartition('.')[-1]
if '?' in ext:
ext = ''
ext = ext.lower() if ext else 'jpg'
ext = ext.lower() if ext and '/' not in ext else 'jpg'
cpath = os.path.join(self.output_dir, 'cover.'+ext)
if os.access(cu, os.R_OK):
with open(cpath, 'wb') as cfile: