Sync to trunk.
commit 16c9b7dc1e

resources/recipes/ieeespectrum.recipe (new file, 63 lines)
@@ -0,0 +1,63 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
'''
spectrum.ieee.org
'''

from calibre.web.feeds.news import BasicNewsRecipe
from string import capwords
from urlparse import urljoin

class IEEESpectrum(BasicNewsRecipe):
    title = 'IEEE Spectrum'
    __author__ = 'Franco Venturi'
    description = 'Electronics News from IEEE'
    publisher = 'IEEE'
    category = 'news, electronics, IT, computer science'
    oldest_article = 32
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'en'
    index = 'http://spectrum.ieee.org/magazine/'
    masthead_url = 'http://spectrum.ieee.org/images/logo_hdr.png'

    remove_javascript = True
    remove_tags = [dict(name={'script':True, 'object':True})]
    remove_attributes = ['height','width','alt']
    keep_only_tags = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]

    def parse_index(self):
        soup = self.index_to_soup(self.index)
        img = soup.find('img', image='cover.gif', src=True)
        if img is not None:
            self.cover_url = 'http://spectrum.ieee.org'+img['src']

        content = soup.find(id='gnrlContent')
        title = content.find(attrs={'class':'style4'}).string.strip()
        date = ' '.join(title.split()[0:2])
        self.timefmt = ' [' + date + ']'
        contents = []
        for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
            if tag['class'] == 'style2':
                contents.append((capwords(tag.renderContents().strip()), []))
            elif tag['class'] == 'lstngTitle':
                url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
                contents[-1][1].append({'title': tag.renderContents().strip(),
                                        'url': url,
                                        'date': date,
                                        'description': '',
                                        'content': ''
                                        })
            elif tag['class'] == 'lstngBody':
                contents[-1][1][-1]['description'] = tag.renderContents().strip()

        return contents

    def preprocess_html(self, soup):
        for a in soup.findAll('a'):
            if not a['href'].lower().startswith('http'):
                a['href'] = urljoin(self.index, a['href'])
        return soup
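
Note (not part of the commit): a minimal sketch of the structure parse_index() above is expected to return, one (section title, list of articles) tuple per magazine section; the values below are invented placeholders.

    # Hypothetical example of the parse_index() return shape; URL and text are made up.
    example_index = [
        ('Features', [
            {'title': 'Sample article',
             'url': 'http://spectrum.ieee.org/some/article/0',
             'date': 'January 2010',
             'description': 'Short teaser text.',
             'content': ''},
        ]),
    ]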
@@ -31,7 +31,7 @@ class NYTimes(BasicNewsRecipe):
    # List of sections to exclude
    # To add a section, copy the section name from the allSectionKeywords list above
    # For example, to exclude 'Dining' and 'Weddings':
    # excludeSectionKeywords = ['Dining','Weddings']
    #excludeSectionKeywords = ['Dining','Weddings']
    excludeSectionKeywords = []

    # List of sections to include (test and debug only)
@@ -56,20 +56,25 @@ class NYTimes(BasicNewsRecipe):
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [dict(attrs={'class':[
                            'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
                            'columnGroup singleRule',
                            'columnGroup last',
                            'columnGroup last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'icon enlargeThis',
                            'leftNavTabs',
                            'module box nav',
                            'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
                            'subNavigation tabContent active clearfix',
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
@@ -222,11 +227,11 @@ class NYTimes(BasicNewsRecipe):

            if div['class'] == 'section-headline':
                key = string.capwords(feed_title(div))
                excluded = re.compile('|'.join(self.excludeSectionKeywords))
                if excluded.search(key):
                    self.log("Skipping section %s" % key)
                    continue

                if self.excludeSectionKeywords:
                    excluded = re.compile('|'.join(self.excludeSectionKeywords))
                    if excluded.search(key):
                        self.log("Skipping section %s" % key)
                        continue
                articles[key] = []
                ans.append(key)

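Reviewer's note, not from the commit message: the added guard matters because once excludeSectionKeywords defaults to an empty list, '|'.join([]) produces an empty pattern, and an empty pattern matches every section title, so the unguarded code would have skipped every section. A minimal sketch of the pitfall:

    import re

    # With no exclude keywords the joined pattern is '', which matches anything.
    excludeSectionKeywords = []
    excluded = re.compile('|'.join(excludeSectionKeywords))
    print(bool(excluded.search('Dining')))   # True for any section name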
@@ -21,9 +21,8 @@ class Timesonline(BasicNewsRecipe):
    use_embedded_content = False
    simultaneous_downloads = 1
    encoding = 'ISO-8859-1'
    lang = 'en-UK'
    remove_javascript = True
    language = 'en'
    language = 'en_GB'
    recursions = 9
    match_regexps = [r'http://www.timesonline.co.uk/.*page=[2-9]']

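A short sketch (the article URL is invented) of what the recursions and match_regexps settings do together here: with recursion enabled, only links matching the pattern are followed, which picks up pages 2 through 9 of multi-page articles.

    import re

    pat = re.compile(r'http://www.timesonline.co.uk/.*page=[2-9]')
    print(bool(pat.match('http://www.timesonline.co.uk/tol/news/sample.ece?page=3')))  # True
    print(bool(pat.match('http://www.timesonline.co.uk/tol/news/sample.ece')))         # False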
@@ -150,7 +150,8 @@ class PRS505(CLI, Device):
        for location in locations:
            info = metadata.next()
            path = location[0]
            blist = 2 if location[3] == 'cardb' else 1 if location[3] == 'carda' else 0
            oncard = location[3]
            blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0

            if self._main_prefix and path.startswith(self._main_prefix):
                name = path.replace(self._main_prefix, '')
@@ -166,7 +167,11 @@ class PRS505(CLI, Device):

            opts = self.settings()
            collections = opts.extra_customization.split(',') if opts.extra_customization else []
            booklists[blist].add_book(info, name, collections, *location[1:-1])
            booklist = booklists[blist]
            if not hasattr(booklist, 'add_book'):
                raise ValueError(('Incorrect upload location %s. Did you choose the'
                    ' correct card A or B, to send books to?')%oncard)
            booklist.add_book(info, name, collections, *location[1:-1])
        fix_ids(*booklists)

    def delete_books(self, paths, end_session=True):
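Not part of the diff, just the mapping the new oncard variable drives, spelled out: 'cardb' selects booklists[2], 'carda' selects booklists[1], and anything else falls back to main memory (booklists[0]); presumably a slot without an add_book method means the chosen card is not actually present, hence the new ValueError.

    # Hedged illustration of the blist selection introduced above.
    for oncard, expected in [('cardb', 2), ('carda', 1), (None, 0), ('main', 0)]:
        blist = 2 if oncard == 'cardb' else 1 if oncard == 'carda' else 0
        assert blist == expected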
@@ -230,14 +230,25 @@ class HTMLPreProcessor(object):

        end_rules = []
        if getattr(self.extra_opts, 'remove_header', None):
            end_rules.append(
                (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
            )
            try:
                end_rules.append(
                    (re.compile(self.extra_opts.header_regex), lambda match : '')
                )
            except:
                import traceback
                print 'Failed to parse remove_header regexp'
                traceback.print_exc()

        if getattr(self.extra_opts, 'remove_footer', None):
            end_rules.append(
                (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
            )

            try:
                end_rules.append(
                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
                )
            except:
                import traceback
                print 'Failed to parse remove_footer regexp'
                traceback.print_exc()

        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
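Sketch (the bad pattern below is invented) of the failure mode the new try/except blocks absorb: a malformed user-supplied header or footer regex used to raise out of preprocessing; now the error is reported and the rule is skipped.

    import re, traceback

    end_rules = []
    header_regex = '[unclosed'            # hypothetical invalid user input
    try:
        end_rules.append((re.compile(header_regex), lambda match: ''))
    except:
        print('Failed to parse remove_header regexp')
        traceback.print_exc()
    # end_rules stays empty and conversion continues without the header rule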
@@ -267,7 +267,7 @@ class MetadataUpdater(object):
            offset += md_len
            self.metadata[tag] = metadata

    def regenerate_headers(self, len_updated_metadata):
    def regenerate_headers(self, updated_md_len):

        headers = {}
        for tag in self.topaz_headers:
@@ -276,22 +276,16 @@ class MetadataUpdater(object):
            else:
                headers[tag] = None

        # Sort headers based on initial offset
        sh = sorted(headers,key=lambda x:(headers[x],headers[x]))

        # Metadata goes last
        sh.remove('metadata')
        sh.append('metadata')

        original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp']
        original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
        delta = updated_md_len - original_md_len

        # Copy the first 5 bytes of the file: sig + num_recs
        ths = StringIO.StringIO()
        ths.write(self.data[:5])

        # Rewrite the offsets for hdr_offsets > metadata original location
        for tag in sh[:-1]:
        # Rewrite the offsets for hdr_offsets > metadata offset
        for tag in headers.keys():
            ths.write('c')
            ths.write(self.encode_vwi(len(tag)))
            ths.write(tag)
@@ -300,32 +294,18 @@ class MetadataUpdater(object):
                for block in self.topaz_headers[tag]['blocks']:
                    b = self.topaz_headers[tag]['blocks'][block]

                    if b['offset'] < original_md_offset:
                    if b['offset'] <= original_md_offset:
                        ths.write(self.encode_vwi(b['offset']))
                    else:
                        ths.write(self.encode_vwi(b['offset'] - original_md_len))
                        ths.write(self.encode_vwi(b['offset'] + delta))

                    ths.write(self.encode_vwi(b['len_uncomp']))
                    if tag == 'metadata':
                        ths.write(self.encode_vwi(updated_md_len))
                    else:
                        ths.write(self.encode_vwi(b['len_uncomp']))
                    ths.write(self.encode_vwi(b['len_comp']))
            else:
                ths.write(self.encode_vwi(0))

        # Adjust metadata offset to end
        new_md_offset = (len(self.data) - self.base - original_md_len)

        new_md_len = len_updated_metadata - 1 - len('metadata') - 1

        # Write the metadata header
        ths.write('c')
        ths.write(self.encode_vwi(len('metadata')))
        ths.write('metadata')
        ths.write(self.encode_vwi(1))
        ths.write(self.encode_vwi(new_md_offset))

        ths.write(self.encode_vwi(new_md_len))
        ths.write(self.encode_vwi(0))

        self.sorted_headers = sh
        self.original_md_start = original_md_offset + self.base
        self.original_md_len = original_md_len
        return ths.getvalue().encode('iso-8859-1')
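Worked example with made-up numbers (not taken from the commit) of the new offset arithmetic above: a block that starts at or before the original metadata block keeps its offset, while a block after it is shifted by delta, the change in metadata length.

    # Hypothetical offsets, chosen only to show the arithmetic.
    original_md_offset = 1000
    original_md_len = 120
    updated_md_len = 150
    delta = updated_md_len - original_md_len   # 30

    offset_before = 400    # <= original_md_offset: written unchanged
    offset_after = 2000    # >  original_md_offset: written as offset_after + delta
    assert offset_after + delta == 2030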
@@ -364,8 +344,8 @@ class MetadataUpdater(object):
        self.stream.write(head)
        self.stream.write('d')
        self.stream.write(chunk1)
        self.stream.write(chunk2)
        self.stream.write(updated_metadata)
        self.stream.write(chunk2)

def get_metadata(stream):
    mu = MetadataUpdater(stream)
@@ -377,6 +357,21 @@ def set_metadata(stream, mi):
    return

if __name__ == '__main__':
    #print get_metadata(open(sys.argv[1], 'rb'))
    mi = MetaInformation(title="My New Title", authors=['Smith, John'])
    set_metadata(open(sys.argv[1], 'rb'), mi)
    if False:
        # Test get_metadata()
        print get_metadata(open(sys.argv[1], 'rb'))
    else:
        # Test set_metadata()
        import cStringIO
        data = open(sys.argv[1], 'rb')
        stream = cStringIO.StringIO()
        stream.write(data.read())
        mi = MetaInformation(title="A Marvelously Long Title", authors=['Riker, Gregory; Riker, Charles'])
        set_metadata(stream, mi)

        # Write the result
        tokens = sys.argv[1].rpartition('.')
        updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb')
        updated_data.write(stream.getvalue())
        updated_data.close()

|
@ -864,10 +864,10 @@ class BasicNewsRecipe(Recipe):
|
||||
self.log.error(_('Could not download cover: %s')%str(err))
|
||||
self.log.debug(traceback.format_exc())
|
||||
if cu is not None:
|
||||
ext = cu.rpartition('.')[-1]
|
||||
ext = cu.split('/')[-1].rpartition('.')[-1]
|
||||
if '?' in ext:
|
||||
ext = ''
|
||||
ext = ext.lower() if ext else 'jpg'
|
||||
ext = ext.lower() if ext and '/' not in ext else 'jpg'
|
||||
cpath = os.path.join(self.output_dir, 'cover.'+ext)
|
||||
if os.access(cu, os.R_OK):
|
||||
with open(cpath, 'wb') as cfile:
|
||||
|
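Before/after illustration (the URL is fabricated) of the extension handling change: a cover URL whose last path segment contains no dot used to yield an 'extension' spanning most of the URL; taking the last path segment first, together with the new '/' check, keeps the result a sane filename suffix.

    cu = 'http://img.example.com/covers/2010/0131'      # made-up cover URL

    old_ext = cu.rpartition('.')[-1]                     # 'com/covers/2010/0131'
    new_ext = cu.split('/')[-1].rpartition('.')[-1]      # '0131'

    print(old_ext)
    print(new_ext)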