mirror of https://github.com/kovidgoyal/calibre.git

commit 16c9b7dc1e: Sync to trunk.

New file: resources/recipes/ieeespectrum.recipe (63 lines)
resources/recipes/ieeespectrum.recipe (new file)
@@ -0,0 +1,63 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
'''
spectrum.ieee.org
'''

from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords
from urlparse import urljoin

class IEEESpectrum(BasicNewsRecipe):
    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]


    def parse_index(self):
        soup = self.index_to_soup(self.index)
        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']

        content = soup.find(id='gnrlContent')
        title = content.find(attrs={'class':'style4'}).string.strip()
        date = ' '.join(title.split()[0:2])
        self.timefmt = ' [' + date + ']'
        contents = []
        for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
            if tag['class'] == 'style2':
                contents.append((capwords(tag.renderContents().strip()), []))
            elif tag['class'] == 'lstngTitle':
                url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
                contents[-1][1].append({'title': tag.renderContents().strip(),
                                        'url': url,
                                        'date': date,
                                        'description': '',
                                        'content': ''
                                       })
            elif tag['class'] == 'lstngBody':
                contents[-1][1][-1]['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        for a in soup.findAll('a'):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup
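A note on the recipe's preprocess_html: urljoin rebases any relative href against the magazine index, so links still resolve once the article is viewed outside the site. A minimal sketch of the behavior (Python 2, matching the recipe's urlparse import; the article paths are invented examples):

    from urlparse import urljoin

    index = 'http://spectrum.ieee.org/magazine/'
    print urljoin(index, '/computing/some-article')  # site-absolute: http://spectrum.ieee.org/computing/some-article
    print urljoin(index, 'some-article')             # relative: http://spectrum.ieee.org/magazine/some-article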
@@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe):
    # List of sections to exclude
    # To add a section, copy the section name from the allSectionKeywords list above
    # For example, to exclude 'Dining' and 'Weddings':
-   # excludeSectionKeywords = ['Dining','Weddings']
+   #excludeSectionKeywords = ['Dining','Weddings']
    excludeSectionKeywords = []

    # List of sections to include (test and debug only)
@@ -56,20 +56,25 @@ class NYTimes(BasicNewsRecipe):
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [dict(attrs={'class':[
+                           'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
+                           'columnGroup singleRule',
                            'columnGroup last',
+                           'columnGroup last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'icon enlargeThis',
                            'leftNavTabs',
                            'module box nav',
+                           'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
+                           'subNavigation tabContent active clearfix',
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
@@ -222,11 +227,11 @@ class NYTimes(BasicNewsRecipe):

            if div['class'] == 'section-headline':
                key = string.capwords(feed_title(div))
-               excluded = re.compile('|'.join(self.excludeSectionKeywords))
-               if excluded.search(key):
-                   self.log("Skipping section %s" % key)
-                   continue
-
+               if self.excludeSectionKeywords:
+                   excluded = re.compile('|'.join(self.excludeSectionKeywords))
+                   if excluded.search(key):
+                       self.log("Skipping section %s" % key)
+                       continue
                articles[key] = []
                ans.append(key)

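The new guard fixes a real trap: with an empty exclude list, '|'.join([]) yields an empty pattern, and an empty regex matches every string, so every section would have been skipped. A minimal demonstration:

    import re

    excludeSectionKeywords = []
    excluded = re.compile('|'.join(excludeSectionKeywords))  # pattern is ''
    print excluded.search('World') is not None               # True: any section name matches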
@@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe):
    use_embedded_content = False
    simultaneous_downloads = 1
    encoding = 'ISO-8859-1'
-   lang = 'en-UK'
    remove_javascript = True
-   language = 'en'
+   language = 'en_GB'
    recursions = 9
    match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']

@@ -150,7 +150,8 @@ class PRS505(CLI, Device):
        for location in locations:
            info = metadata.next()
            path = location[0]
-           blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
+           oncard = location[3]
+           blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

            if self._main_prefix and path.startswith(self._main_prefix):
                name = path.replace(self._main_prefix, '')
@@ -166,7 +167,11 @@ class PRS505(CLI, Device):

            opts = self.settings()
            collections = opts.extra_customization.split(',') if opts.extra_customization else []
-           booklists[blist].add_book(info, name, collections, *location[1:-1])
+           booklist = booklists[blist]
+           if not hasattr(booklist, 'add_book'):
+               raise ValueError(('Incorrect upload location %s. Did you choose the'
+                   ' correct card A or B, to send books to?')%oncard)
+           booklist.add_book(info, name, collections, *location[1:-1])
        fix_ids(*booklists)

    def delete_books(self, paths, end_session=True):
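The hasattr check turns a confusing AttributeError into a descriptive ValueError when books are sent to a card slot with no usable booklist. A rough sketch of the surrounding logic; the placeholder tuple is invented for illustration, not the driver's actual representation:

    def booklist_index(oncard):
        # main memory -> 0, card A -> 1, card B -> 2, as in the driver
        return 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

    booklists = (['main'], [], None)   # hypothetical: card B absent
    booklist = booklists[booklist_index('cardb')]
    if not hasattr(booklist, 'add_book'):
        raise ValueError(('Incorrect upload location %s. Did you choose the'
            ' correct card A or B, to send books to?') % 'cardb')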
@@ -230,14 +230,25 @@ class HTMLPreProcessor(object):

        end_rules = []
        if getattr(self.extra_opts, 'remove_header', None):
-           end_rules.append(
-               (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
-           )
+           try:
+               end_rules.append(
+                   (re.compile(self.extra_opts.header_regex), lambda match : '')
+               )
+           except:
+               import traceback
+               print 'Failed to parse remove_header regexp'
+               traceback.print_exc()

        if getattr(self.extra_opts, 'remove_footer', None):
-           end_rules.append(
-               (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
-           )
+           try:
+               end_rules.append(
+                   (re.compile(self.extra_opts.footer_regex), lambda match : '')
+               )
+           except:
+               import traceback
+               print 'Failed to parse remove_footer regexp'
+               traceback.print_exc()

        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
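header_regex and footer_regex come straight from user preferences, so a malformed pattern used to abort the whole conversion; the new try/except logs the failure and carries on. A minimal sketch of the failure mode (the bad pattern is a hypothetical user input):

    import re, traceback

    user_pattern = '(unclosed'   # hypothetical bad input: unbalanced parenthesis
    end_rules = []
    try:
        end_rules.append((re.compile(user_pattern), lambda match: ''))
    except:
        print 'Failed to parse remove_header regexp'
        traceback.print_exc()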
@@ -267,7 +267,7 @@ class MetadataUpdater(object):
            offset += md_len
            self.metadata[tag] = metadata

-   def regenerate_headers(self, len_updated_metadata):
+   def regenerate_headers(self, updated_md_len):

        headers = {}
        for tag in self.topaz_headers:
@@ -276,22 +276,16 @@ class MetadataUpdater(object):
            else:
                headers[tag] = None

-       # Sort headers based on initial offset
-       sh = sorted(headers,key=lambda x:(headers[x],headers[x]))
-
-       # Metadata goes last
-       sh.remove('metadata')
-       sh.append('metadata')
-
        original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp']
        original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
+       delta = updated_md_len - original_md_len

        # Copy the first 5 bytes of the file: sig + num_recs
        ths = StringIO.StringIO()
        ths.write(self.data[:5])

-       # Rewrite the offsets for hdr_offsets > metadata original location
-       for tag in sh[:-1]:
+       # Rewrite the offsets for hdr_offsets > metadata offset
+       for tag in headers.keys():
            ths.write('c')
            ths.write(self.encode_vwi(len(tag)))
            ths.write(tag)
@@ -300,32 +294,18 @@ class MetadataUpdater(object):
                for block in self.topaz_headers[tag]['blocks']:
                    b = self.topaz_headers[tag]['blocks'][block]

-                   if b['offset'] < original_md_offset:
+                   if b['offset'] <= original_md_offset:
                        ths.write(self.encode_vwi(b['offset']))
                    else:
-                       ths.write(self.encode_vwi(b['offset'] - original_md_len))
+                       ths.write(self.encode_vwi(b['offset'] + delta))

-                   ths.write(self.encode_vwi(b['len_uncomp']))
+                   if tag == 'metadata':
+                       ths.write(self.encode_vwi(updated_md_len))
+                   else:
+                       ths.write(self.encode_vwi(b['len_uncomp']))
                    ths.write(self.encode_vwi(b['len_comp']))
            else:
                ths.write(self.encode_vwi(0))

-       # Adjust metadata offset to end
-       new_md_offset = (len(self.data) - self.base - original_md_len)
-
-       new_md_len = len_updated_metadata - 1 - len('metadata') - 1
-
-       # Write the metadata header
-       ths.write('c')
-       ths.write(self.encode_vwi(len('metadata')))
-       ths.write('metadata')
-       ths.write(self.encode_vwi(1))
-       ths.write(self.encode_vwi(new_md_offset))
-
-       ths.write(self.encode_vwi(new_md_len))
-       ths.write(self.encode_vwi(0))
-
-       self.sorted_headers = sh
        self.original_md_start = original_md_offset + self.base
        self.original_md_len = original_md_len
        return ths.getvalue().encode('iso-8859-1')
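The rewrite replaces the old sort-and-relocate scheme with a single delta: blocks at or before the original metadata offset keep their positions, and everything after it shifts by the change in metadata length. A simplified sketch of that arithmetic; the block dicts and numbers are invented for illustration, not the real Topaz header layout:

    original_md_offset = 1000
    original_md_len, updated_md_len = 200, 260
    delta = updated_md_len - original_md_len      # +60

    blocks = [{'offset': 500}, {'offset': 1000}, {'offset': 1400}]
    for b in blocks:
        if b['offset'] <= original_md_offset:
            print b['offset']                     # 500, 1000: unchanged
        else:
            print b['offset'] + delta             # 1460: shifted past the grown metadata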
@@ -364,8 +344,8 @@ class MetadataUpdater(object):
        self.stream.write(head)
        self.stream.write('d')
        self.stream.write(chunk1)
-       self.stream.write(chunk2)
        self.stream.write(updated_metadata)
+       self.stream.write(chunk2)

def get_metadata(stream):
    mu = MetadataUpdater(stream)
@@ -377,6 +357,21 @@ def set_metadata(stream, mi):
        return

if __name__ == '__main__':
-   #print get_metadata(open(sys.argv[1], 'rb'))
-   mi = MetaInformation(title="My New Title", authors=['Smith, John'])
-   set_metadata(open(sys.argv[1], 'rb'), mi)
+   if False:
+       # Test get_metadata()
+       print get_metadata(open(sys.argv[1], 'rb'))
+   else:
+       # Test set_metadata()
+       import cStringIO
+       data = open(sys.argv[1], 'rb')
+       stream = cStringIO.StringIO()
+       stream.write(data.read())
+       mi = MetaInformation(title="A Marvelously Long Title", authors=['Riker, Gregory; Riker, Charles'])
+       set_metadata(stream, mi)
+
+       # Write the result
+       tokens = sys.argv[1].rpartition('.')
+       updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb')
+       updated_data.write(stream.getvalue())
+       updated_data.close()

@@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe):
                self.log.error(_('Could not download cover: %s')%str(err))
                self.log.debug(traceback.format_exc())
        if cu is not None:
-           ext = cu.rpartition('.')[-1]
+           ext = cu.split('/')[-1].rpartition('.')[-1]
            if '?' in ext:
                ext = ''
-           ext = ext.lower() if ext else 'jpg'
+           ext = ext.lower() if ext and '/' not in ext else 'jpg'
            cpath = os.path.join(self.output_dir, 'cover.'+ext)
            if os.access(cu, os.R_OK):
                with open(cpath, 'wb') as cfile:
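The two changed lines guard against cover URLs where rpartition('.') latches onto a dot in a directory component instead of a file extension. A short demonstration with a hypothetical URL:

    cu = 'http://example.com/covers.dir/12345'
    print cu.rpartition('.')[-1]                   # old: 'dir/12345', a bogus extension
    print cu.split('/')[-1].rpartition('.')[-1]    # new: '12345', last path segment only
    # and the added "'/' not in ext" test makes any remaining slash fall back to 'jpg'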