mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Recipe for Los Angeles Times (thanks to Darko Miletic)
This commit is contained in:
parent
f56e8739cb
commit
69671e1386
@ -627,7 +627,6 @@ def set_metadata(stream, mi):
|
|||||||
|
|
||||||
|
|
||||||
def main(args=sys.argv):
|
def main(args=sys.argv):
|
||||||
import os.path
|
|
||||||
parser = option_parser()
|
parser = option_parser()
|
||||||
options, args = parser.parse_args(args)
|
options, args = parser.parse_args(args)
|
||||||
if len(args) != 2:
|
if len(args) != 2:
|
||||||
|
@ -124,7 +124,8 @@ class MobiReader(object):
|
|||||||
|
|
||||||
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
||||||
|
|
||||||
def __init__(self, filename_or_stream):
|
def __init__(self, filename_or_stream, verbose=False):
|
||||||
|
self.verbose = verbose
|
||||||
if hasattr(filename_or_stream, 'read'):
|
if hasattr(filename_or_stream, 'read'):
|
||||||
stream = filename_or_stream
|
stream = filename_or_stream
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
@ -189,6 +190,8 @@ class MobiReader(object):
|
|||||||
'</style>\n',
|
'</style>\n',
|
||||||
self.processed_html)
|
self.processed_html)
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
print 'Parsing HTML...'
|
||||||
soup = BeautifulSoup(self.processed_html)
|
soup = BeautifulSoup(self.processed_html)
|
||||||
self.cleanup_soup(soup)
|
self.cleanup_soup(soup)
|
||||||
guide = soup.find('guide')
|
guide = soup.find('guide')
|
||||||
@ -212,6 +215,8 @@ class MobiReader(object):
|
|||||||
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
|
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
|
||||||
|
|
||||||
def cleanup_html(self):
|
def cleanup_html(self):
|
||||||
|
if self.verbose:
|
||||||
|
print 'Cleaning up HTML...'
|
||||||
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
|
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
|
||||||
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
||||||
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
|
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
|
||||||
@ -221,6 +226,8 @@ class MobiReader(object):
|
|||||||
self.processed_html = re.sub(r'(?i)</%s>'%t, r'</span>', self.processed_html)
|
self.processed_html = re.sub(r'(?i)</%s>'%t, r'</span>', self.processed_html)
|
||||||
|
|
||||||
def cleanup_soup(self, soup):
|
def cleanup_soup(self, soup):
|
||||||
|
if self.verbose:
|
||||||
|
print 'Replacing height, width and align attributes'
|
||||||
for tag in soup.recursiveChildGenerator():
|
for tag in soup.recursiveChildGenerator():
|
||||||
if not isinstance(tag, Tag): continue
|
if not isinstance(tag, Tag): continue
|
||||||
styles = []
|
styles = []
|
||||||
@ -311,6 +318,8 @@ class MobiReader(object):
|
|||||||
return data[:len(data)-trail_size]
|
return data[:len(data)-trail_size]
|
||||||
|
|
||||||
def extract_text(self):
|
def extract_text(self):
|
||||||
|
if self.verbose:
|
||||||
|
print 'Extracting text...'
|
||||||
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
|
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
|
||||||
processed_records = list(range(0, self.book_header.records+1))
|
processed_records = list(range(0, self.book_header.records+1))
|
||||||
|
|
||||||
@ -343,6 +352,8 @@ class MobiReader(object):
|
|||||||
self.processed_html)
|
self.processed_html)
|
||||||
|
|
||||||
def add_anchors(self):
|
def add_anchors(self):
|
||||||
|
if self.verbose:
|
||||||
|
print 'Adding anchors...'
|
||||||
positions = set([])
|
positions = set([])
|
||||||
link_pattern = re.compile(r'<[^<>]+filepos=[\'"]{0,1}(\d+)[^<>]*>', re.IGNORECASE)
|
link_pattern = re.compile(r'<[^<>]+filepos=[\'"]{0,1}(\d+)[^<>]*>', re.IGNORECASE)
|
||||||
for match in link_pattern.finditer(self.mobi_html):
|
for match in link_pattern.finditer(self.mobi_html):
|
||||||
@ -370,6 +381,8 @@ class MobiReader(object):
|
|||||||
self.processed_html)
|
self.processed_html)
|
||||||
|
|
||||||
def extract_images(self, processed_records, output_dir):
|
def extract_images(self, processed_records, output_dir):
|
||||||
|
if self.verbose:
|
||||||
|
print 'Extracting images...'
|
||||||
output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
|
output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
|
||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
@ -438,7 +451,7 @@ def main(args=sys.argv):
|
|||||||
parser.print_help()
|
parser.print_help()
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
mr = MobiReader(args[1])
|
mr = MobiReader(args[1], verbose=opts.verbose)
|
||||||
opts.output_dir = os.path.abspath(opts.output_dir)
|
opts.output_dir = os.path.abspath(opts.output_dir)
|
||||||
mr.extract_content(opts.output_dir)
|
mr.extract_content(opts.output_dir)
|
||||||
if opts.verbose:
|
if opts.verbose:
|
||||||
|
@ -11,7 +11,7 @@ recipes = [
|
|||||||
'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht', 'lanacion',
|
'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht', 'lanacion',
|
||||||
'discover_magazine', 'scientific_american', 'new_york_review_of_books',
|
'discover_magazine', 'scientific_american', 'new_york_review_of_books',
|
||||||
'daily_telegraph', 'guardian', 'el_pais', 'new_scientist', 'b92',
|
'daily_telegraph', 'guardian', 'el_pais', 'new_scientist', 'b92',
|
||||||
'politika', 'moscow_times'
|
'politika', 'moscow_times', 'latimes'
|
||||||
]
|
]
|
||||||
|
|
||||||
import re, imp, inspect, time, os
|
import re, imp, inspect, time, os
|
||||||
|
28
src/calibre/web/feeds/recipes/latimes.py
Normal file
28
src/calibre/web/feeds/recipes/latimes.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
'''
|
||||||
|
latimes.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class LATimes(BasicNewsRecipe):
    '''
    Recipe for the Los Angeles Times (latimes.com).

    The class is purely declarative: it defines no methods of its own and
    only sets attributes that the BasicNewsRecipe base class reads.
    '''

    # Descriptive metadata for the recipe.
    title = u'The Los Angeles Times'
    __author__ = u'Darko Miletic'
    description = u'News from Los Angeles'

    # Feed limits and download options (interpreted by BasicNewsRecipe).
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # Tag selectors: the article lives in <div id="center">; the
    # "socialnet" div marks the cut-off point and is itself removed,
    # together with the video wrapper and the related-articles box.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': 'center'}},
    ]
    remove_tags_after = [
        {'name': 'div', 'attrs': {'id': 'socialnet'}},
    ]
    remove_tags = [
        {'name': 'div', 'attrs': {'id': 'wrapper_vid'}},
        {'name': 'div', 'attrs': {'id': 'article_related'}},
        {'name': 'div', 'attrs': {'id': 'socialnet'}},
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        (u'News', u'http://feeds.latimes.com/latimes/news'),
    ]
|
25
src/calibre/web/feeds/recipes/moscow_times.py
Normal file
25
src/calibre/web/feeds/recipes/moscow_times.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
'''
|
||||||
|
moscowtimes.ru
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class Moscowtimes(BasicNewsRecipe):
    '''
    Recipe for The Moscow Times.

    Declares the feed and download options read by the BasicNewsRecipe
    base class and maps each article URL to its printer-friendly variant.
    '''

    # Descriptive metadata for the recipe.
    title = u'The Moscow Times'
    __author__ = 'Darko Miletic'
    description = 'News from Russia'

    # Feed limits and download options (interpreted by BasicNewsRecipe).
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # (feed title, feed URL) pairs.
    feeds = [
        (u'The Moscow Times', u'http://www.themoscowtimes.com/rss.xml'),
    ]

    def print_version(self, url):
        '''
        Return the URL of the print version of the article at `url`.

        The print version is selected with the ``print=Y`` query
        parameter. If `url` already carries a query string the parameter
        is appended with ``&``; otherwise a query string is started with
        ``?`` so that the resulting URL stays well formed.
        '''
        if '?' in url:
            separator = '&'
        else:
            separator = '?'
        return url + separator + 'print=Y'
|
Loading…
x
Reference in New Issue
Block a user