Improved recipe for Politico

This commit is contained in:
Kovid Goyal 2009-12-17 10:18:32 -07:00
commit 1d0784f7c7
8 changed files with 69 additions and 20 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: cp1252 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
@ -11,9 +12,9 @@ import re, traceback
from calibre.web.feeds.news import BasicNewsRecipe
class Politico(BasicNewsRecipe):
title = 'Politico'
__author__ = 'Darko Miletic'
__author__ = 'Darko Miletic and Sujata Raman'
description = 'Political news from USA'
publisher = 'Capitol News Company, LLC'
category = 'news, politics, USA'
@ -22,23 +23,34 @@ class Politico(BasicNewsRecipe):
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'cp1252'
encoding = 'UTF-8'
language = 'en'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
remove_tags = [dict(name=['notags','embed','object','link','img'])]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
remove_tags = [
dict(name=['notags','embed','object','link','img']),
]
extra_css = '''
body{font-family:Arial,Sans-serif;}
element.style{color:#FF0000;font-family:Arial,Sans-serif;}
.author{color:#808080;font-size:x-small;}
a{ color:#003399;}
.byline{color:#696969 ; font-size:x-small;}
.story{color:#000000;}
td{color:#000000;}
'''
feeds = [
(u'Top Stories' , u'http://www.politico.com/rss/politicopicks.xml' )
(u'Top Stories' , u'http://www.politico.com/rss/politicopicks.xml' )
,(u'Congress' , u'http://www.politico.com/rss/congress.xml' )
,(u'Ideas' , u'http://www.politico.com/rss/ideas.xml' )
,(u'Life' , u'http://www.politico.com/rss/life.xml' )
@ -48,17 +60,23 @@ class Politico(BasicNewsRecipe):
,(u'Roger Simon' , u'http://www.politico.com/rss/rogersimon.xml' )
,(u'Suite Talk' , u'http://www.politico.com/rss/suitetalk.xml' )
,(u'Playbook' , u'http://www.politico.com/rss/playbook.xml' )
,(u'The Huddle' , u'http://www.politico.com/rss/huddle.xml' )
#(u'The Huddle' , u'http://www.politico.com/rss/huddle.xml' )
]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>'
mtag = '<meta http-equiv="Content-Language" content="en"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
url_pat = re.compile(r'<a href="([^"]+printstory\.cfm[^"]+)"')
url_pat = re.compile(r'<a href="([^"]+print.*\.cfm[^"]+)"')
def postprocess_html(self, soup, first):
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
return soup
def print_version(self, url):
raw = self.index_to_soup(url, raw=True)

View File

@ -26,9 +26,12 @@ class Sueddeutsche(BasicNewsRecipe):
dict(name='div', attrs={'id':["artikel","contentTable"]}) ,
]
remove_tags = [ dict(name='link'), dict(name='iframe'),
dict(name='div', attrs={'id':["themenbox","artikelfoot","CAD_AD","rechteSpalte"]}),
dict(name='div', attrs={'id':["themenbox","artikelfoot","CAD_AD","SKY_AD","NT1_AD","rechteSpalte"]}),
dict(name='div', attrs={'class':["similar-article-box","artikelliste","nteaser301bg","pages closed"]}),
dict(name='div', attrs={'class':["listHeader","listHeader2","hr2","item","videoBigButton"]}),
dict(name='p', attrs={'class':["ressortartikeln",]}),
dict(name='div', attrs={'style':["position:relative;"]}),
dict(name='span', attrs={'class':["nlinkheaderteaserschwarz",]}),
dict(name='table', attrs={'class':["kommentare","footer","pageBoxBot","pageAktiv","bgcontent"]}),
dict(name='ul', attrs={'class':["breadcrumb","articles","activities"]}),
dict(name='p', text = "ANZEIGE")
@ -66,3 +69,4 @@ class Sueddeutsche(BasicNewsRecipe):

View File

@ -121,5 +121,7 @@ class ITALICA(EB600):
VENDOR_NAME = 'ITALICA'
WINDOWS_MAIN_MEM = 'EREADER'
WINDOWS_CARD_A_MEM = WINDOWS_MAIN_MEM
OSX_MAIN_MEM = 'Italica eReader Media'
OSX_CARD_A_MEM = OSX_MAIN_MEM

View File

@ -25,7 +25,7 @@ class DRMError(ValueError):
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
'html', 'xhtml', 'pdf', 'pdb', 'prc', 'mobi', 'azw', 'doc',
'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1']
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml']
class HTMLRenderer(object):

View File

@ -16,10 +16,16 @@ class MOBIInput(InputFormatPlugin):
accelerators):
from calibre.ebooks.mobi.reader import MobiReader
from lxml import html
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
parse_cache = {}
mr.extract_content('.', parse_cache)
try:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
mr.extract_content('.', parse_cache)
except:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline, try_extra_data_fix=True)
mr.extract_content('.', parse_cache)
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw:
if isinstance(raw, unicode):

View File

@ -108,7 +108,7 @@ class EXTHHeader(object):
class BookHeader(object):
def __init__(self, raw, ident, user_encoding, log):
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
self.log = log
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
@ -141,7 +141,8 @@ class BookHeader(object):
self.codec = 'cp1252' if user_encoding is None else user_encoding
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length \
or (try_extra_data_fix and self.length == 0xE4):
self.extra_flags = 0
else:
self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])
@ -229,7 +230,8 @@ class MobiReader(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
try_extra_data_fix=False):
self.log = log
self.debug = debug
self.embedded_mi = None
@ -284,7 +286,7 @@ class MobiReader(object):
self.book_header = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log)
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir, parse_cache):

View File

@ -587,20 +587,32 @@ class DocumentView(QWebView):
if self.manager is not None:
self.manager.next_document()
else:
oopos = self.document.ypos
#print '\nOriginal position:', oopos
self.document.set_bottom_padding(0)
opos = self.document.ypos
#print 'After set padding=0:', self.document.ypos
if opos < oopos:
if self.manager is not None:
self.manager.next_document()
return
lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
max_y = self.document.height - window_height # The maximum possible top y co-ord
if max_y < lower_limit:
#print 'Setting padding to:', lower_limit - max_y
self.document.set_bottom_padding(lower_limit - max_y)
max_y = self.document.height - window_height
lower_limit = min(max_y, lower_limit)
#print 'Scroll to:', lower_limit
if lower_limit > opos:
self.document.scroll_to(self.document.xpos, lower_limit)
actually_scrolled = self.document.ypos - opos
#print 'After scroll pos:', self.document.ypos
self.find_next_blank_line(window_height - actually_scrolled)
#print 'After blank line pos:', self.document.ypos
if self.manager is not None:
self.manager.scrolled(self.scroll_fraction)
#print 'After all:', self.document.ypos
def scroll_by(self, x=0, y=0, notify=True):
old_pos = self.document.ypos

View File

@ -124,6 +124,11 @@ If you do need to reset your metadata due to problems caused by using both
at the same time, then just delete the media.xml file on the Reader using
your PC's file explorer and it will be recreated after disconnection.
With recent reader iterations, SONY, in all its wisdom, has decided to try to force you to
use their software. If you install it, it auto-launches whenever you connect the reader.
If you don't want to uninstall it altogether, there are a couple of tricks you can use. The
simplest is to rename the executable file that launches the library program. More detail
`here <http://www.mobileread.com/forums/showthread.php?t=65809>`_.
Can I use the collections feature of the SONY reader?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~