Merge from main branch, plus my changes
Changelog.yaml (198 lines changed)
@@ -4,6 +4,204 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.

+- version: 0.6.36
+  date: 2010-01-25
+
+  new features:
+    - title: Catalog generation in MOBI format
+
+    - title: "Driver for Inves Book 600"
+
+    - title: "Show notifications on OS X even when systray icon is disabled."
+
+  bug fixes:
+    - title: Fix memory leak in catalog generation
+
+    - title: Fix regression that broke PML output
+
+    - title: Fix bug in MOBI Input
+      tickets: [4643]
+
+    - title: "Replace commas with semi-colons in download tags"
+      tickets: [4650]
+
+    - title: Fix catalog output format dropdown empty in linux
+      tickets: [4656]
+
+    - title: "Fix display of non-English characters in OS X notifications"
+      tickets: [4654]
+
+    - title: Add .cbc to list of book formats
+      tickets: [4662]
+
+    - title: "Content server: Mobile page breaks if library contains empty books. Now fixed."
+
+    - title: "Support old 212 byte header PDB files"
+      tickets: [4646]
+
+    - title: "Fix regression that caused wrong error message to be displayed when device is out of space"
+
+  new recipes:
+    - title: Harvard Business Review Blogs
+      author: Brian_G
+
+    - title: Neowin
+      author: Darko Miletic
+
+    - title: Greensboro News and Record
+      author: Walt Anthony
+
+    - title: Hot Air
+      author: Walt Anthony
+
+    - title: ionline
+      author: Darko Miletic
+
+    - title: The National Review Online
+      author: Walt Anthony
+
+  improved recipes:
+    - Ars Technica
+    - Sports Illustrated
+    - Common Dreams
+    - Wired Magazine
+
+- version: 0.6.35
+  date: 2010-01-22
+
+  new features:
+    - title: Catalog generation
+      type: major
+      description: >
+        "You can now easily generate a catalog of all books in your calibre library by clicking the arrow next to the convert button. The catalog can be in one of several formats: XML, CSV, EPUB and MOBI, with scope for future formats via plugins. If you generate the catalog in an e-book format, it will be automatically sent to your e-book reader the next time you connect it, allowing you to easily browse your collection on the reader itself. This feature is in Beta (may have bugs) so feedback is appreciated."
+
+    - title: "RTF Input: Support for unicode characters."
+      type: major
+      tickets: [4501]
+
+    - title: "Add Quick Start Guide by John Schember to calibre library on first run of calibre"
+      type: major
+
+    - title: "Improve handling of justification"
+      description: >
+        "Now calibre will explicitly change the justification of all left aligned paragraphs to justified or vice versa depending on the justification setting. This should make it possible to robustly convert all content to either justified or not. calibre will not touch centered or right aligned content."
+
+    - title: "E-book viewer: Fit images to viewer window (can be turned off via Preferences)"
+
+    - title: "Add section on E-book viewer to User Manual"
+
+    - title: "Development environment: First look for resources in the location pointed to by CALIBRE_DEVELOP_FROM. If not found, use the normal resource location"
+
+    - title: "When reading metadata from filenames, with the Swap author names option checked, improve the logic used to detect the author's last name."
+      tickets: [4620]
+
+    - title: "News downloads: When getting an article URL from an RSS feed, look first for an original article link. This speeds up the download of news services that use a syndication service like feedburner or pheedo to publish their RSS feeds."
+
+  bug fixes:
+    - title: "Windows device detection: Don't do expensive polling while waiting for device disconnect. This should fix the problems people have with their floppy drive being activated while an e-book reader is connected"
+
+    - title: "PML Input: Fix creation of metadata Table of Contents"
+      tickets: [5633]
+
+    - title: "Fix Tag browser not updating after using delete specific format actions"
+      tickets: [4632]
+
+    - title: "MOBI Output: Don't die when converting EPUB files with SVG covers"
+
+    - title: "Nook driver: Remove the # character from filenames when sending to device"
+      tickets: [4629]
+
+    - title: "Workaround for bug in QtWebKit on windows that could cause crashes when using the next page button in the e-book viewer for certain files"
+      tickets: [4606]
+
+    - title: "MOBI Input: Rescale img width and height attributes that were specified in em units"
+      tickets: [4608]
+
+    - title: "ebook-meta: Fix setting of series metadata"
+
+    - title: "RTF metadata: Fix reading metadata from very small files"
+
+    - title: "Conversion pipeline: Don't error out if the user sets an invalid chapter detection XPath"
+
+    - title: "Fix main mem and card being swapped in pocketbook detection on OS X"
+
+    - title: "Welcome wizard: Set the language to english if the user doesn't explicitly change the language. This ensures that the language will be english on windows by default"
+
+    - title: "Fix bug in OEBWriter that could cause writing out of resources in subdirectories with URL unsafe names to fail"
+
+    - title: "E-book viewer: Change highlight color to yellow on all platforms."
+      tickets: [4641]
+
+  new recipes:
+    - title: Frankfurter Rundschau
+      author: Justus Bisser
+
+    - title: The Columbia Journalism Review
+      author: XanthanGum
+
+    - title: Various CanWest Canadian news sources
+      author: Nick Redding
+
+    - title: digitaljournal.com
+      author: Darko Miletic
+
+    - title: Pajamas Media
+      author: Krittika Goyal
+
+    - title: Algemeen Dagblad
+      author: kwetal
+
+    - title: "The Reader's Digest"
+      author: BrianG
+
+    - title: The Yemen Times
+      author: kwetal
+
+    - title: The Kitsap Sun
+      author: Darko Miletic
+
+    - title: drivelry.com
+      author: Krittika Goyal
+
+    - title: New recipe for Google Reader that downloads unread articles instead of just starred ones
+      author: rollercoaster
+
+    - title: Le Devoir
+      author: Lorenzo Vigentini
+
+    - title: Joop
+      author: kwetal
+
+    - title: Various computer magazines
+      author: Lorenzo Vigentini
+
+    - title: The Wall Street Journal (free parts)
+      author: Nick Redding
+
+    - title: Journal of Nephrology
+      author: Krittika Goyal
+
+    - title: stuff.co.nz
+      author: Krittika Goyal
+
+    - title: Editor and Publisher
+      author: XanthanGum
+
+    - title: The Week (free)
+      author: Darko Miletic
+
+  improved recipes:
+    - Physics Today
+    - Wall Street Journal
+    - American Spectator
+    - FTD
+    - The National Post
+    - Blic
+    - Ars Technica
+
 - version: 0.6.34
   date: 2010-01-15
BIN icons/book.icns (new file, 11 KiB)
BIN resources/catalog/DefaultCover.jpg (new file, 22 KiB)
BIN resources/catalog/mastheadImage.gif (new file, 18 KiB)

resources/catalog/stylesheet.css (new file, 73 lines)
@@ -0,0 +1,73 @@
body { background-color: white; }

p.title {
    margin-top:0em;
    margin-bottom:1em;
    text-align:center;
    font-style:italic;
    font-size:xx-large;
    border-bottom: solid black 4px;
}

p.author {
    margin-top:0em;
    margin-bottom:0em;
    text-align: left;
    text-indent: 1em;
    font-size:large;
}

p.tags {
    margin-top:0em;
    margin-bottom:0em;
    text-align: left;
    text-indent: 1em;
    font-size:small;
}

p.description {
    text-align:left;
    font-style:italic;
    margin-top: 0em;
}

p.date_index {
    font-size:x-large;
    text-align:center;
    font-weight:bold;
    margin-top:1em;
    margin-bottom:0px;
}

p.letter_index {
    font-size:x-large;
    text-align:center;
    font-weight:bold;
    margin-top:1em;
    margin-bottom:0px;
}

p.author_index {
    font-size:large;
    text-align:left;
    margin-top:0px;
    margin-bottom:0px;
    text-indent: 0em;
}

p.read_book {
    text-align:left;
    margin-top:0px;
    margin-bottom:0px;
    margin-left:2em;
    text-indent:-2em;
}

p.unread_book {
    text-align:left;
    margin-top:0px;
    margin-bottom:0px;
    margin-left:2em;
    text-indent:-2em;
}
BIN resources/images/news/greensboro_news_and_record.png (new file, 480 B)
BIN resources/images/news/hotair.png (new file, 363 B)
BIN resources/images/news/information_dk.png (new file, 343 B)
BIN resources/images/news/ionline_pt.png (new file, 647 B)
BIN resources/images/news/jp_dk.png (new file, 609 B)
BIN resources/images/news/ledevoir.png (new file, 531 B)
BIN resources/images/news/michellemalkin_icon.png (new file, 419 B)
BIN resources/images/news/nationalreviewonline.png (new file, 815 B)
BIN resources/images/news/neowin.png (new file, 1.0 KiB)
BIN resources/images/news/nursingtimes.png (new file, 788 B)
BIN resources/images/news/observer.png (new file, 835 B)
BIN resources/images/news/politiken_dk.png (new file, 482 B)
BIN resources/images/news/the_week_magazine_free.png (new file, 301 B)
@@ -1,12 +1,12 @@
-#!/usr/bin/env python
-
+
 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 arstechnica.com
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

 class ArsTechnica2(BasicNewsRecipe):
     title = u'Ars Technica'
@@ -18,24 +18,24 @@ class ArsTechnica2(BasicNewsRecipe):
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
-    encoding = 'utf8'
-    remove_javascript = True
+    encoding = 'utf-8'
     use_embedded_content = False
+    extra_css = ' body {font-family: sans-serif} .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none} '

-    extra_css = '''
-        .news-item-title{font-size: medium ;font-family:Arial,Helvetica,sans-serif; font-weight:bold;}
-        .news-item-teaser{font-size: small ;font-family:Arial,Helvetica,sans-serif; font-weight:bold;}
-        .news-item-byline{font-size:xx-small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-        .news-item-text{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
-        .news-item-figure-caption-text{font-size:xx-small; font-family:Arial,Helvetica,sans-serif;font-weight:bold;}
-        .news-item-figure-caption-byline{font-size:xx-small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-        '''
-
-    keep_only_tags = [dict(name='div', attrs={'id':['news-item-info','news-item']})]
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+    }
+
+    keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]

     remove_tags = [
         dict(name=['object','link','embed'])
-        ,dict(name='div', attrs={'class':'related-stories'})
+        ,dict(name='div', attrs={'class':'read-more-link'})
     ]

@@ -52,14 +52,19 @@ class ArsTechnica2(BasicNewsRecipe):
     ]

     def append_page(self, soup, appendtag, position):
-        pager = soup.find('div',attrs={'id':'pager'})
+        pager = soup.find('div',attrs={'class':'pager'})
         if pager:
            for atag in pager.findAll('a',href=True):
                str = self.tag_to_string(atag)
                if str.startswith('Next'):
-                  soup2 = self.index_to_soup(atag['href'])
+                  nurl = 'http://arstechnica.com' + atag['href']
+                  rawc = self.index_to_soup(nurl,True)
+                  soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)

-                  texttag = soup2.find('div', attrs={'class':'news-item-text'})
+                  readmoretag = soup2.find('div', attrs={'class':'read-more-link'})
+                  if readmoretag:
+                     readmoretag.extract()
+                  texttag = soup2.find('div', attrs={'class':'body'})
                   for it in texttag.findAll(style=True):
                       del it['style']

@@ -71,10 +76,12 @@ class ArsTechnica2(BasicNewsRecipe):

     def preprocess_html(self, soup):
-        ftag = soup.find('div', attrs={'class':'news-item-byline'})
+        ftag = soup.find('div', attrs={'class':'byline'})
         if ftag:
-           ftag.insert(4,'<br /><br />')
+           brtag  = Tag(soup,'br')
+           brtag2 = Tag(soup,'br')
+           ftag.insert(4,brtag)
+           ftag.insert(5,brtag2)

         for item in soup.findAll(style=True):
             del item['style']
@@ -83,5 +90,3 @@ class ArsTechnica2(BasicNewsRecipe):

         return soup
-
-
@@ -98,6 +98,9 @@ class Barrons(BasicNewsRecipe):
         ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
     ]

+    def get_article_url(self, article):
+        return article.get('link', None)
+
     def get_cover_url(self):
         cover_url = None
resources/recipes/cjr.recipe (new file, 15 lines)
@@ -0,0 +1,15 @@
from calibre.web.feeds.news import BasicNewsRecipe

class CJR(BasicNewsRecipe):
    title = u'Columbia Journalism Review'
    __author__ = u'Xanthan Gum'
    description = 'News about journalism.'
    language = 'en'

    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [(u'News Stories', u'http://www.cjr.org/index.xml')]

    def print_version(self, url):
        return url + '?page=all&print=true'
@@ -2,17 +2,37 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class CommonDreams(BasicNewsRecipe):
+    # Identify the recipe
+
     title = u'Common Dreams'
     description = u'Progressive news and views'
     __author__ = u'XanthanGum'
     language = 'en'

+    # Format the text
+
+    extra_css = '''
+        body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
+        h1{font-size: xx-large;}
+        h2{font-size: large;}
+    '''
+
+    # Pick no article older than seven days and limit the number of articles per feed to 100
+
     oldest_article = 7
     max_articles_per_feed = 100

-    feeds = [
-        (u'Common Dreams Headlines',
-         u'http://www.commondreams.org/feed/headlines_rss'),
-        (u'Common Dreams Views', u'http://www.commondreams.org/feed/views_rss'),
-        (u'Common Dreams Newswire', u'http://www.commondreams.org/feed/newswire_rss')
-    ]
+    # Remove everything before the article
+
+    remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'})
+
+    # Remove everything after the article
+
+    remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
+
+    # Identify the news feeds
+
+    feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
+             (u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
+             (u'Views', u'http://www.commondreams.org/feed/views_rss'),
+             (u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
@@ -10,13 +10,31 @@ doscovermagazine.com
 from calibre.web.feeds.news import BasicNewsRecipe

 class DiscoverMagazine(BasicNewsRecipe):
+
     title = u'Discover Magazine'
     description = u'Science, Technology and the Future'
     __author__ = 'Mike Diaz'
-    oldest_article = 33
     language = 'en'
+
+    oldest_article = 33
     max_articles_per_feed = 20
+    no_stylesheets = True
+    remove_javascript = True
+    use_embedded_content = False
+    encoding = 'utf-8'
+
+    extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'
+
+    remove_tags_before = dict(id='articlePage')
+
+    keep_only_tags = [dict(name='div', attrs={'id':'articlePage'})]
+
+    remove_tags = [dict(attrs={'id':['buttons', 'tool-box', 'teaser', 'already-subscriber', 'teaser-suite', 'related-articles', 'relatedItem', 'box-popular', 'box-blogs', 'box-news', 'footer']}),
+                   dict(attrs={'class':'popularNewsBox'}),
+                   dict(name=['img', 'style', 'head'])]
+
+    remove_tags_after = dict(id='articlePage')
+
     feeds = [
         (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
         (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
@@ -53,6 +53,8 @@ class Economist(BasicNewsRecipe):
             self.feed_dict.items()])

     def eco_sort_sections(self, feeds):
+        if not feeds:
+            raise ValueError('No new articles found')
         order = {
             'The World This Week': 1,
             'Leaders': 2,
resources/recipes/editor_and_publisher.recipe (new file, 34 lines)
@@ -0,0 +1,34 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class EandP(BasicNewsRecipe):
    title = u'Editor and Publisher'
    __author__ = u'Xanthan Gum'
    description = 'News about newspapers and journalism.'
    language = 'en'
    no_stylesheets = True

    oldest_article = 7
    max_articles_per_feed = 100

    # Font formatting code borrowed from kwetal

    extra_css = '''
        body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
        h1{font-size: xx-large;}
        h2{font-size: large;}
    '''

    # Delete everything before the article

    remove_tags_before = dict(name='font', attrs={'class':'titlebar_black'})

    # Delete everything after the article

    preprocess_regexps = [(re.compile(r'<!--endclickprintinclude-->.*</body>', re.DOTALL|re.IGNORECASE),
                           lambda match: '</body>'),]

    feeds = [(u'Breaking News', u'http://feeds.feedburner.com/EditorAndPublisher-BreakingNews'),
             (u'Business News', u'http://feeds.feedburner.com/EditorAndPublisher-BusinessNews'),
             (u'Newsroom', u'http://feeds.feedburner.com/EditorAndPublisher-Newsroom'),
             (u'Technology News', u'http://feeds.feedburner.com/EditorAndPublisher-Technology'),
             (u'Syndicates News', u'http://feeds.feedburner.com/EditorAndPublisher-Syndicates')]
resources/recipes/fr_online.recipe (new file, 67 lines)
@@ -0,0 +1,67 @@
__license__   = 'GPL v3'
__copyright__ = '2009, Justus Bisser <justus.bisser at gmail.com>'
'''
fr-online.de
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class Spiegel_ger(BasicNewsRecipe):
    title = 'Frankfurter Rundschau'
    __author__ = 'Justus Bisser'
    description = "Dies ist die Online-Ausgabe der Frankfurter Rundschau. Um die abgerufenen individuell einzustellen bearbeiten sie die Liste im erweiterten Modus. Die Feeds findet man auf http://www.fr-online.de/verlagsservice/fr_newsreader/?em_cnt=574255"
    publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
    category = 'FR Online, Frankfurter Rundschau, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'de'
    lang = 'de-DE'
    no_stylesheets = True
    use_embedded_content = False
    #encoding = 'cp1252'

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : lang
    }

    recursions = 0
    max_articles_per_feed = 100
    #keep_only_tags = [dict(name='div', attrs={'class':'text'})]
    #tags_remove = [dict(name='div', attrs={'style':'text-align: left; margin: 4px 0px 0px 4px; width: 200px; float: right;'})]
    remove_attributes = ['style']
    feeds = []
    #remove_tags_before = [dict(name='div', attrs={'style':'padding-left: 0px;'})]
    #remove_tags_after = [dict(name='div', attrs={'class':'box_head_text'})]

    # enable for all news
    allNews = 0
    if allNews:
        feeds = [(u'Frankfurter Rundschau', u'http://www.fr-online.de/rss/sport/index.xml')]
    else:
        #select the feeds you like
        feeds = [(u'Nachrichten', u'http://www.fr-online.de/rss/politik/index.xml')]
        feeds.append((u'Kommentare und Analysen', u'http://www.fr-online.de/rss/meinung/index.xml'))
        feeds.append((u'Dokumentationen', u'http://www.fr-online.de/rss/dokumentation/index.xml'))
        feeds.append((u'Deutschlandtrend', u'http://www.fr-online.de/rss/deutschlandtrend/index.xml'))
        feeds.append((u'Wirtschaft', u'http://www.fr-online.de/rss/wirtschaft/index.xml'))
        feeds.append((u'Sport', u'http://www.fr-online.de/rss/sport/index.xml'))
        feeds.append((u'Feuilleton', u'http://www.fr-online.de/rss/feuilleton/index.xml'))
        feeds.append((u'Panorama', u'http://www.fr-online.de/rss/panorama/index.xml'))
        feeds.append((u'Rhein Main und Hessen', u'http://www.fr-online.de/rss/hessen/index.xml'))
        feeds.append((u'Fitness und Gesundheit', u'http://www.fr-online.de/rss/fit/index.xml'))
        feeds.append((u'Multimedia', u'http://www.fr-online.de/rss/multimedia/index.xml'))
        feeds.append((u'Wissen und Bildung', u'http://www.fr-online.de/rss/wissen/index.xml'))

    def get_article_url(self, article):
        url = article.link
        regex = re.compile("0C[0-9]{6,8}0A?")

        liste = regex.findall(url)
        string = liste.pop(0)
        string = string[2:len(string)-1]
        return "http://www.fr-online.de/_em_cms/_globals/print.php?em_cnt=" + string
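Note on the get_article_url() override above: it pulls the numeric em_cnt article id out of the syndicated item URL and rewrites it onto fr-online.de's print CGI. A minimal sketch of the same transformation; the regex and the slicing are the recipe's own, but the feed URL below is made up for illustration (the real ones come from the RSS entries):

import re

# Hypothetical syndicated item URL; the article id is encoded as 0C<id>0.
url = 'http://rss.feedsportal.com/c/401/f/9999/0C5742550/story01.htm'
token = re.compile("0C[0-9]{6,8}0A?").findall(url).pop(0)  # -> '0C5742550'
em_cnt = token[2:len(token)-1]                             # -> '574255'
print("http://www.fr-online.de/_em_cms/_globals/print.php?em_cnt=" + em_cnt)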
@@ -15,7 +15,7 @@ class FTDe(BasicNewsRecipe):
     __author__ = 'Oliver Niesner'
     use_embedded_content = False
     timefmt = ' [%d %b %Y]'
-    language = _('German')
+    language = 'de'
     max_articles_per_feed = 40
     no_stylesheets = True

@@ -23,6 +23,7 @@ class FTDe(BasicNewsRecipe):
     dict(id='topbanner'),
     dict(id='seitenkopf'),
     dict(id='BoxA-0-0-0'),
+    #dict(id='BoxA-2-0-0'),
     dict(id='footer'),
     dict(id='rating_open'),
     dict(id='ADS_Top'),

@@ -59,6 +60,7 @@ class FTDe(BasicNewsRecipe):
     dict(name='div', attrs={'class':'relatedhalb'}),
     dict(name='div', attrs={'class':'box boxListScrollOutline'}),
     dict(name='div', attrs={'class':'box boxPhotoshow boxImgWide'}),
+    dict(name='div', attrs={'class':'box boxTeaser boxPhotoshow boxImgWide'}),
     dict(name='div', attrs={'class':'box boxTeaser'}),
     dict(name='div', attrs={'class':'tagCloud'}),
     dict(name='div', attrs={'class':'pollView'}),
@@ -32,7 +32,7 @@ class GlobeAndMail(BasicNewsRecipe):
                'gallery-controls', 'video', 'galleryLoading','deck','header',
                'toolsBottom'] },
     {'class':['credit','inline-img-caption','tab-pointer'] },
-    dict(name='div', attrs={'id':'lead-photo'}),
+    dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}),
     dict(name='div', attrs={'class':'right'}),
     dict(name='div', attrs={'id':'footer'}),
     dict(name='div', attrs={'id':'beta-msg'}),

@@ -44,8 +44,9 @@ class GlobeAndMail(BasicNewsRecipe):
     dict(name='div', attrs={'id':'blog-header'}),
     dict(name='div', attrs={'id':'right-rail'}),
     dict(name='div', attrs={'id':'group-footer-container'}),
-    dict(name=['iframe'])
+    dict(name=['iframe', 'style'])
     ]
+    remove_attributes = ['style']
     remove_tags_after = [{'id':['article-content']},
     {'class':['pull','inline-img'] },
     dict(name='img', attrs={'class':'inline-media-embed'}),
resources/recipes/greensboro_news_and_record.recipe (new file, 54 lines)
@@ -0,0 +1,54 @@
__license__   = 'GPL v3'
__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
'''
www.news-record.com
'''
from calibre.web.feeds.news import BasicNewsRecipe

class NewsandRecord(BasicNewsRecipe):
    title = u'Greensboro News & Record'
    description = "News from Greensboro, North Carolina"
    __author__ = 'Walt Anthony'
    publisher = 'News & Record and Landmark Media Enterprises, LLC'
    category = 'news, USA'
    oldest_article = 3  # days
    max_articles_per_feed = 25
    summary_length = 150
    language = 'en'
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags_before = dict(name='h3', attrs={'class':'nrcTxt_headline'})
    remove_tags_after = dict(name='div', attrs={'id':'nrcBlk_ContentBody'})

    remove_tags = [
        dict(name='iframe'),
        dict(name=['notags','embed','object','link','img']),
    ]

    feeds = [
        ('News', 'http://www.news-record.com/news/archive/feed'),
        ('Greensboro News', 'http://www.news-record.com/news/greensboro/feed'),
        ('Education', 'http://www.news-record.com/news/education/feed'),
        ('Government', 'http://www.news-record.com/news/government/feed'),
        ('College Sports', 'http://www.news-record.com/sports/college/feed'),
        ('Sports Extra', 'http://www.news-record.com/blog/sportsextra/feed'),
        ('Life', 'http://www.news-record.com/life/top/feed'),
        ('NASCAR', 'http://www.news-record.com/sports/nascar/top/feed'),
        ('Editorials', 'http://www.news-record.com/opinion/editorials/feed'),
        ('Letters to the Editor', 'http://www.news-record.com/opinion/letters/feed')
    ]
resources/recipes/hbr_blogs.recipe (new file, 197 lines)
@@ -0,0 +1,197 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

# Needed for BLOGs
from calibre.web.feeds import Feed

class HBR(BasicNewsRecipe):

    title = 'Harvard Business Review Blogs'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman, enhanced by BrianG'
    language = 'en'
    no_stylesheets = True

    LOGIN_URL = 'http://hbr.org/login?request_url=/'
    INDEX = 'http://hbr.org/current'

    #
    # Blog Stuff
    #

    INCLUDE_BLOGS = True
    INCLUDE_ARTICLES = False

    # option-specific settings.

    if INCLUDE_BLOGS == True:
        remove_tags_after = dict(id='articleBody')
        remove_tags_before = dict(id='pageFeature')
        feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')]
        oldest_article = 30
        max_articles_per_feed = 100
    else:
        timefmt = ' [%B %Y]'

    keep_only_tags = [ dict(name='div', id='pageContainer') ]

    remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
        'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
        'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
        'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD',
        'mailingListTout', 'partnerCenter', 'pageFooter']),
        dict(name='iframe')]

    extra_css = '''
        a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
        .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
        h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
        h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }
        #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
        #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
    '''
    #-------------------------------------------------------------------------------------------------

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.LOGIN_URL)
        br.select_form(name='signInForm')
        br['signInForm:username'] = self.username
        br['signInForm:password'] = self.password
        raw = br.submit().read()
        if 'My Account' not in raw:
            raise Exception('Failed to login, are you sure your username and password are correct?')
        self.logout_url = None
        link = br.find_link(text='Sign out')
        if link:
            self.logout_url = link.absolute_url
        return br
    #-------------------------------------------------------------------------------------------------

    def cleanup(self):
        if self.logout_url is not None:
            self.browser.open(self.logout_url)
    #-------------------------------------------------------------------------------------------------

    def map_url(self, url):
        if url.endswith('/ar/1'):
            return url[:-1]+'pr'
    #-------------------------------------------------------------------------------------------------

    def hbr_get_toc(self):
        soup = self.index_to_soup(self.INDEX)
        url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href')
        return self.index_to_soup('http://hbr.org'+url)

    #-------------------------------------------------------------------------------------------------

    def hbr_parse_section(self, container, feeds):
        current_section = None
        current_articles = []
        for x in container.findAll(name=['li', 'h3', 'h4']):
            if x.name in ['h3', 'h4'] and not x.findAll(True):
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                current_articles = []
                self.log('\tFound section:', current_section)
            if x.name == 'li':
                a = x.find('a', href=True)
                if a is not None:
                    title = self.tag_to_string(a)
                    url = a.get('href')
                    if '/ar/' not in url:
                        continue
                    if url.startswith('/'):
                        url = 'http://hbr.org'+url
                    url = self.map_url(url)
                    p = x.find('p')
                    desc = ''
                    if p is not None:
                        desc = self.tag_to_string(p)
                    if not title or not url:
                        continue
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    self.log('\t\t\t', desc)
                    current_articles.append({'title':title, 'url':url,
                        'description':desc, 'date':''})
        if current_section and current_articles:
            feeds.append((current_section, current_articles))

    #-------------------------------------------------------------------------------------------------

    def hbr_parse_toc(self, soup):
        feeds = []
        features = soup.find(id='issueFeaturesContent')
        self.hbr_parse_section(features, feeds)
        departments = soup.find(id='issueDepartments')
        self.hbr_parse_section(departments, feeds)
        return feeds
    #-------------------------------------------------------------------------------------------------

    def feed_to_index_append(self, feedObject, masterFeed):
        # Loop thru the feed object and build the correct type of article list
        for feed in feedObject:
            # build the correct structure from the feed object
            newArticles = []
            for article in feed.articles:
                newArt = {
                    'title'       : article.title,
                    'url'         : article.url,
                    'date'        : article.date,
                    'description' : article.text_summary
                }
                newArticles.append(newArt)

            # Append the earliest/latest dates of the feed to the feed title
            startDate, endDate = self.get_feed_dates(feed, '%d-%b')
            newFeedTitle = feed.title + ' (' + startDate + ' thru ' + endDate + ')'

            # append the newly-built list object to the index object passed in
            # as masterFeed.
            masterFeed.append( (newFeedTitle,newArticles) )

    #-------------------------------------------------------------------------------------------------

    def get_feed_dates(self, feedObject, dateMask):
        startDate = feedObject.articles[len(feedObject.articles)-1].localtime.strftime(dateMask)
        endDate   = feedObject.articles[0].localtime.strftime(dateMask)

        return startDate, endDate

    #-------------------------------------------------------------------------------------------------

    def hbr_parse_blogs(self, feeds):
        # Do the "official" parse_feeds first
        rssFeeds = Feed()

        # Use the PARSE_FEEDS method to get a Feeds object of the articles
        rssFeeds = BasicNewsRecipe.parse_feeds(self)

        # Create a new feed of the right configuration and append to existing afeeds
        self.feed_to_index_append(rssFeeds[:], feeds)

    #-------------------------------------------------------------------------------------------------

    def parse_index(self):
        if self.INCLUDE_ARTICLES == True:
            soup = self.hbr_get_toc()
            feeds = self.hbr_parse_toc(soup)
        else:
            feeds = []

        # blog stuff
        if self.INCLUDE_BLOGS == True:
            self.hbr_parse_blogs(feeds)

        return feeds
    #-------------------------------------------------------------------------------------------------

    def get_cover_url(self):
        cover_url = None
        index = 'http://hbr.org/current'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)

        if link_item:
            cover_url = 'http://hbr.org' + link_item['src']

        return cover_url
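Note on feed_to_index_append() above: it flattens the Feed objects returned by BasicNewsRecipe.parse_feeds() back into the plain (section title, list of article dicts) tuples that parse_index() must return, which is what lets the RSS-sourced blog posts be merged with a hand-parsed magazine TOC. A sketch of the target shape, using placeholder values rather than real articles:

# The structure parse_index() is expected to return (placeholder data):
feeds = [
    ('Blog (12-Jan thru 25-Jan)', [
        {'title': 'Example post', 'url': 'http://hbr.org/example',
         'date': '', 'description': 'Placeholder summary'},
    ]),
]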
resources/recipes/heraldo.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__     = 'GPL v3'
__author__      = 'Lorenzo Vigentini'
__copyright__   = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__description__ = 'Daily newspaper from Aragon'
__version__     = 'v1.01'
__date__        = '30, January 2010'

'''
http://www.heraldo.es/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
    author = 'Lorenzo Vigentini'
    description = 'Daily newspaper from Aragon'

    cover_url = 'http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo.gif'
    title = u'Heraldo de Aragon'
    publisher = 'OJD Nielsen'
    category = 'News, politics, culture, economy, general interest'

    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 1
    max_articles_per_feed = 25

    use_embedded_content = False
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = [
        dict(name='div', attrs={'class':['titularNoticiaNN','textoGrisVerdanaContenidos']})
    ]

    feeds = [
        (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
    ]

    extra_css = '''
        .articledate {color: gray;font-family: monospace;}
        .articledescription {display: block;font-family: sans;font-size: 0.7em; text-indent: 0;}
        .firma {color: #666;display: block;font-family: verdana, arial, helvetica;font-size: 1em;margin-bottom: 8px;}
        .textoGrisVerdanaContenidos {color: #56595c;display: block;font-family: Verdana;font-size: 1.28571em;padding-bottom: 10px}
        .titularNoticiaNN {display: block;padding-bottom: 10px;padding-left: 0;padding-right: 0;padding-top: 4px}
        .titulo {color: #003066;font-family: Tahoma;font-size: 1.92857em;font-weight: bold;line-height: 1.2em}
    '''
resources/recipes/hotair.recipe (new file, 41 lines)
@@ -0,0 +1,41 @@
__license__   = 'GPL v3'
__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
'''
www.hotair.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class hotair(BasicNewsRecipe):
    title = u'Hot Air'
    __author__ = 'Walt Anthony'
    description = "The world's first, full-service conservative Internet broadcast network"
    publisher = 'Hot Air'
    category = 'news, politics, USA'
    oldest_article = 3
    max_articles_per_feed = 100
    summary_length = 150
    language = 'en'
    encoding = 'utf-8'
    use_embedded_content = False
    remove_javascript = True

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    keep_only_tags = [dict(name='div', attrs={'id':'page-post'})]

    remove_tags = [dict(name=['iframe', 'small', 'embed', 'object','link','script','form'])]

    feeds = [
        ('Hot Air', 'http://feeds.feedburner.com/hotair/main'),
        ('The Greenroom', 'http://feeds2.feedburner.com/hotair/greenroom')
    ]
resources/recipes/information_dk.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
information.dk
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Information_dk(BasicNewsRecipe):
    title = 'Information - Denmark'
    __author__ = 'Darko Miletic'
    description = 'News from Denmark'
    publisher = 'information.dk'
    category = 'news, politics, Denmark'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    encoding = 'utf8'
    language = 'da'

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    feeds = [
        (u'Nyheder fra', u'http://www.information.dk/feed')
        ,(u'Bedst lige nu', u'http://www.information.dk/bedstligenu/feed')
        ,(u'Politik og internationalt', u'http://www.information.dk/politik/feed')
        ,(u'Kunst og kultur', u'http://www.information.dk/kultur/feed')
        ,(u'Moderne Tider', u'http://www.information.dk/modernetider/feed')
        ,(u'Klima', u'http://www.information.dk/klima/feed')
        ,(u'Opinion', u'http://www.information.dk/opinion/feed')
        ,(u'Literatur', u'http://www.information.dk/litteratur/feed')
        ,(u'Film', u'http://www.information.dk/film/feed')
        ,(u'Kunst', u'http://www.information.dk/kunst/feed')
    ]

    remove_tags_before = dict(name='h1',attrs={'class':'print-title'})
    remove_tags_after = dict(name='div',attrs={'class':'print-footer'})
    remove_tags = [dict(name=['object','link'])]

    def print_version(self, url):
        return url.replace('information.dk/','information.dk/print/')
resources/recipes/ionline_pt.recipe (new file, 58 lines)
@@ -0,0 +1,58 @@
__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ionline.pt
'''

from calibre.web.feeds.news import BasicNewsRecipe

class IOnline_pt(BasicNewsRecipe):
    title = 'ionline - Portugal'
    __author__ = 'Darko Miletic'
    description = 'News from Portugal'
    publisher = 'ionline.pt'
    category = 'ionline, noticias, portugal, jornal, actualidade, benfica, bolsa, desporto, empresas, globo, europa, futebol, internacional, investir, lisboa, jogos, musica, videos, tempo, meteorologia, pais, politica, porto, sporting, fcporto, televisao, tv, opiniao, nacional, sociedade, crise, financeira, policia, crime, artes, cinema, cultura, madeleine, blog, ciencia, tecnologia, galerias, fotografia, fotos, famosos, emprego, imagens, teatro, news, mundial, governo, ps, psd, be, pcp, cds, pp, partidos'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'pt'
    extra_css = ' .publish{font-style: italic; line-height: 1.2em; border-bottom: 1px dotted; padding: 5px 0} .entity{line-height: 1.2em} .overview{line-height:1.2em} '

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    keep_only_tags = [
        dict(name=['h5','h1'])
        , dict(name='div', attrs={'class':['publish','overview','entity']})
    ]

    remove_tags = [
        dict(name=['object','embed','iframe'])
    ]

    feeds = [
        (u'Portugal' , u'http://www.ionline.pt/rss/portugal.xml' )
        ,(u'Mundo'    , u'http://www.ionline.pt/rss/mundo.xml'    )
        ,(u'Dinheiro' , u'http://www.ionline.pt/rss/dinheiro.xml' )
        ,(u'Desporto' , u'http://www.ionline.pt/rss/desporto.xml' )
        ,(u'Boa Vida' , u'http://www.ionline.pt/rss/boavida.xml'  )
        ,(u'iReporter', u'http://www.ionline.pt/rss/ireporter.xml')
        ,(u'iBlogues' , u'http://www.ionline.pt/rss/iblogues.xml' )
        ,(u'Desporto' , u'http://www.ionline.pt/rss/desporto.xml' )
    ]

    def print_version(self, url):
        rest = url.rpartition('/')[2]
        lmain = rest.partition('-')[0]
        lurl = u'http://www.ionline.pt/interior/index.php?p=news-print&idNota=' + lmain
        return lurl
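For reference, print_version() above just peels the numeric id off the front of the article slug and feeds it to the site's print endpoint. A quick illustration of the partitioning, with a made-up article URL (the real ones come from the RSS feeds):

# Hypothetical article URL of the form .../interior/<id>-<slug>
url = u'http://www.ionline.pt/interior/574255-algum-titulo'
rest = url.rpartition('/')[2]    # -> '574255-algum-titulo'
idnota = rest.partition('-')[0]  # -> '574255'
print(u'http://www.ionline.pt/interior/index.php?p=news-print&idNota=' + idnota)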
@@ -11,7 +11,7 @@ class IrishTimes(BasicNewsRecipe):
     title = u'The Irish Times'
     __author__ = "Derry FitzGerald, Ray Kinsella and David O'Callaghan"
     language = 'en'
-    timefmt = ' (%A, %B %e, %Y)'
+    timefmt = ' (%A, %B %d, %Y)'

     oldest_article = 3
resources/recipes/jp_dk.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
jp.dk
'''

from calibre.web.feeds.news import BasicNewsRecipe

class JP_dk(BasicNewsRecipe):
    title = 'Jyllands-Posten'
    __author__ = 'Darko Miletic'
    description = 'News from Denmark'
    publisher = 'jp.dk'
    category = 'news, politics, Denmark'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    language = 'da'

    extra_css = ' body{font-family: Arial,Verdana,Helvetica,Geneva,sans-serif } h1{font-family: Times,Georgia,Verdana,serif } '

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    feeds = [
        (u'Tophistorier', u'http://www.jp.dk/rss/topnyheder.jsp')
        ,(u'Seneste nyt', u'http://jp.dk/index.jsp?service=rssfeed&submode=seneste')
        ,(u'Indland', u'http://www.jp.dk/rss/indland.jsp')
        ,(u'Udland', u'http://www.jp.dk/rss/udland.jsp')
        ,(u'Ny viden', u'http://www.jp.dk/rss/nyviden.jsp')
        ,(u'Timeout', u'http://www.jp.dk/rss/timeout.jsp')
        ,(u'Kultur', u'http://www.jp.dk/rss/kultur.jsp')
        ,(u'Sport', u'http://www.jp.dk/rss/sport.jsp')
    ]

    remove_tags = [
        dict(name=['object','link'])
        ,dict(name='p',attrs={'class':'artByline'})
    ]

    def print_version(self, url):
        return url + '?service=printversion'
@ -1,4 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
@ -8,6 +11,7 @@ class JASN(BasicNewsRecipe):
|
|||||||
__author__ = 'Krittika Goyal'
|
__author__ = 'Krittika Goyal'
|
||||||
oldest_article = 31 #days
|
oldest_article = 31 #days
|
||||||
max_articles_per_feed = 25
|
max_articles_per_feed = 25
|
||||||
|
delay = 5
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
|
|
||||||
INDEX = 'http://jasn.asnjournals.org/current.shtml'
|
INDEX = 'http://jasn.asnjournals.org/current.shtml'
|
||||||
@ -15,13 +19,13 @@ class JASN(BasicNewsRecipe):
|
|||||||
remove_tags_before = dict(name='h2')
|
remove_tags_before = dict(name='h2')
|
||||||
#remove_tags_after = dict(name='th', attrs={'align':'left'})
|
#remove_tags_after = dict(name='th', attrs={'align':'left'})
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='iframe'),
|
dict(name='iframe'),
|
||||||
#dict(name='div', attrs={'class':'related-articles'}),
|
#dict(name='div', attrs={'class':'related-articles'}),
|
||||||
dict(name='td', attrs={'id':['jasnFooter']}),
|
dict(name='td', attrs={'id':['jasnFooter']}),
|
||||||
dict(name='table', attrs={'id':"jasnNavBar"}),
|
dict(name='table', attrs={'id':"jasnNavBar"}),
|
||||||
dict(name='table', attrs={'class':'content_box_outer_table'}),
|
dict(name='table', attrs={'class':'content_box_outer_table'}),
|
||||||
dict(name='th', attrs={'align':'left'})
|
dict(name='th', attrs={'align':'left'})
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -45,10 +49,52 @@ class JASN(BasicNewsRecipe):
             raise ValueError('Failed to log in, is your account expired?')
         return br

-    feeds = [
-            ('JASN',
-             'http://jasn.asnjournals.org/rss/current.xml'),
-            ]
+    #feeds = [
+    #        ('JASN',
+    #         'http://jasn.asnjournals.org/rss/current.xml'),
+    #        ]

+    #TO GET ARTICLE TOC
+    def jasn_get_index(self):
+        return self.index_to_soup('http://jasn.asnjournals.org/current.shtml')
+
+    # To parse article toc
+    def parse_index(self):
+        parse_soup = self.jasn_get_index()
+
+        div = parse_soup.find(id='tocBody')
+
+        current_section = None
+        current_articles = []
+        feeds = []
+        for x in div.findAll(True):
+            if x.name == 'h2':
+                # Section heading found
+                if current_articles and current_section:
+                    feeds.append((current_section, current_articles))
+                current_section = self.tag_to_string(x)
+                current_articles = []
+                self.log('\tFound section:', current_section)
+            if current_section is not None and x.name == 'strong':
+                title = self.tag_to_string(x)
+                a = x.parent.parent.find('a', href=lambda x: x and '/full/' in x)
+                if a is None:
+                    continue
+                url = a.get('href', False)
+                if not url or not title:
+                    continue
+                if url.startswith('/'):
+                    url = 'http://jasn.asnjournals.org'+url
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+        if current_articles and current_section:
+            feeds.append((current_section, current_articles))
+
+        return feeds

@@ -59,10 +105,18 @@ class JASN(BasicNewsRecipe):
             if not url:
                 continue
             if url.startswith('/'):
-                url = 'http://jasn.asnjournals.org/'+url
-            isoup = self.index_to_soup(url)
-            img = isoup.find('img', src=lambda x: x and
-                    x.startswith('/content/'))
+                url = 'http://jasn.asnjournals.org'+url
+            img = isoup = None
+            try:
+                isoup = self.index_to_soup(url)
+            except:
+                time.sleep(5)
+                try:
+                    isoup = self.index_to_soup(url)
+                except:
+                    continue
+            img = isoup.find('img', src=lambda x: x and x.startswith('/content/'))
+
             if img is not None:
                 img.extract()
                 table = a.findParent('table')
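The second hunk above replaces the bare page fetch with a one-retry idiom: if index_to_soup(url) raises, the recipe sleeps five seconds and tries once more before giving up on that article (this assumes time is imported elsewhere in the recipe). A minimal standalone sketch of the same pattern, with a hypothetical fetch callable standing in for self.index_to_soup:

    import time

    def fetch_with_one_retry(fetch, url, pause=5):
        # Try once; on failure wait briefly and try a second time.
        # Returns None when both attempts fail, so the caller can skip the item.
        try:
            return fetch(url)
        except Exception:
            time.sleep(pause)
            try:
                return fetch(url)
            except Exception:
                return None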
@@ -71,3 +125,4 @@ class JASN(BasicNewsRecipe):
+
resources/recipes/macleans.recipe (new file, 239 lines)
@@ -0,0 +1,239 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
macleans.ca
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import timedelta, date

class Macleans(BasicNewsRecipe):
    title          = u'Macleans Magazine'
    __author__     = 'Nick Redding'
    language       = 'en_CA'
    description    = ('Macleans Magazine')

    no_stylesheets = True
    timefmt = ' [%b %d]'

    # customization notes: delete sections you are not interested in
    # set oldest_article to the maximum number of days back from today to include articles
    sectionlist = [
        ['http://www2.macleans.ca/','Front Page'],
        ['http://www2.macleans.ca/category/canada/','Canada'],
        ['http://www2.macleans.ca/category/world-from-the-magazine/','World'],
        ['http://www2.macleans.ca/category/business','Business'],
        ['http://www2.macleans.ca/category/arts-culture/','Culture'],
        ['http://www2.macleans.ca/category/opinion','Opinion'],
        ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'],
        ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'],
        ['http://www2.macleans.ca/category/education/','On Campus'],
        ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel']
    ]
    oldest_article = 7

    # formatting for print version of articles
    extra_css = '''h2{font-family:Times,serif; font-size:large;}
                   small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                '''

    # tag handling for print version of articles
    keep_only_tags = [dict(id='tw-print')]
    remove_tags = [dict({'class':'postmetadata'})]

    def preprocess_html(self,soup):
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup,'p')
                new_tag.insert(0,img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if not self.tag_to_string(parent_tag) == '':
                    new_div = Tag(soup,'div')
                    new_tag = Tag(soup,'p')
                    new_tag.insert(0,img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0,new_tag)
                    new_div.insert(1,parent_tag)
        return soup

    def parse_index(self):

        articles = {}
        key = None
        ans = []

        def parse_index_page(page_url,page_title):

            def decode_date(datestr):
                dmysplit = datestr.strip().lower().split(',')
                mdsplit = dmysplit[1].split()
                m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(mdsplit[0])+1
                d = int(mdsplit[1])
                y = int(dmysplit[2].split()[0])
                return date(y,m,d)

            def article_title(tag):
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                return self.tag_to_string(atag)

            def article_url(tag):
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                return atag['href']+'print/'

            def article_description(tag):
                for p_tag in tag.findAll('p'):
                    d = self.tag_to_string(p_tag,False)
                    if not d == '':
                        return d
                return ''

            def compound_h4_h3_title(tag):
                if tag.h4:
                    if tag.h3:
                        return self.tag_to_string(tag.h4,False)+u'\u2014'+self.tag_to_string(tag.h3,False)
                    else:
                        return self.tag_to_string(tag.h4,False)
                elif tag.h3:
                    return self.tag_to_string(tag.h3,False)
                else:
                    return ''

            def compound_h2_h4_title(tag):
                if tag.h2:
                    if tag.h4:
                        return self.tag_to_string(tag.h2,False)+u'\u2014'+self.tag_to_string(tag.h4,False)
                    else:
                        return self.tag_to_string(tag.h2,False)
                elif tag.h4:
                    return self.tag_to_string(tag.h4,False)
                else:
                    return ''

            def handle_article(header_tag, outer_tag):
                if header_tag:
                    url = article_url(header_tag)
                    title = article_title(header_tag)
                    author_date_tag = outer_tag.h4
                    if author_date_tag:
                        author_date = self.tag_to_string(author_date_tag,False).split(' - ')
                        author = author_date[0].strip()
                        article_date = decode_date(author_date[1])
                        earliest_date = date.today() - timedelta(days=self.oldest_article)
                        if article_date < earliest_date:
                            self.log("Skipping article dated %s" % author_date[1])
                        else:
                            excerpt_div = outer_tag.find('div','excerpt')
                            if excerpt_div:
                                description = article_description(excerpt_div)
                            else:
                                description = ''
                            if not articles.has_key(page_title):
                                articles[page_title] = []
                            articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content=''))

            def handle_category_article(cat, header_tag, outer_tag):
                url = article_url(header_tag)
                title = article_title(header_tag)
                if not title == '':
                    title = cat+u'\u2014'+title
                a_tag = outer_tag.find('span','authorLink')
                if a_tag:
                    author = self.tag_to_string(a_tag,False)
                    a_tag.parent.extract()
                else:
                    author = ''
                description = article_description(outer_tag)
                if not articles.has_key(page_title):
                    articles[page_title] = []
                articles[page_title].append(dict(title=title,url=url,date='',description=description,author=author,content=''))

            soup = self.index_to_soup(page_url)

            if page_title == 'Front Page':
                # special processing for the front page
                top_stories = soup.find('div',{ "id" : "macleansFeatured" })
                if top_stories:
                    for div_slide in top_stories.findAll('div','slide'):
                        url = article_url(div_slide)
                        div_title = div_slide.find('div','header')
                        if div_title:
                            title = self.tag_to_string(div_title,False)
                        else:
                            title = ''
                        description = article_description(div_slide)
                        if not articles.has_key(page_title):
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))

                from_macleans = soup.find('div',{ "id" : "fromMacleans" })
                if from_macleans:
                    for li_tag in from_macleans.findAll('li','fromMacleansArticle'):
                        title = compound_h4_h3_title(li_tag)
                        url = article_url(li_tag)
                        description = article_description(li_tag)
                        if not articles.has_key(page_title):
                            articles[page_title] = []
                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))

                blog_central = soup.find('div',{ "id" : "bloglist" })
                if blog_central:
                    for li_tag in blog_central.findAll('li'):
                        title = compound_h2_h4_title(li_tag)
                        if li_tag.h4:
                            url = article_url(li_tag.h4)
                            if not articles.has_key(page_title):
                                articles[page_title] = []
                            articles[page_title].append(dict(title=title,url=url,date='',description='',author='',content=''))

#                need_to_know = soup.find('div',{ "id" : "needToKnow" })
#                if need_to_know:
#                    for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
#                        title = compound_h4_h3_title(div_tag)
#                        url = article_url(div_tag)
#                        description = article_description(div_tag)
#                        if not articles.has_key(page_title):
#                            articles[page_title] = []
#                        articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content=''))

                for news_category in soup.findAll('div','newsCategory'):
                    news_cat = self.tag_to_string(news_category.h4,False)
                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
                    for news_item in news_category.findAll('li'):
                        handle_category_article(news_cat,news_item.h3,news_item)

                return

            # find the div containing the highlight article
            div_post = soup.find('div','post')
            if div_post:
                h1_tag = div_post.h1
                handle_article(h1_tag,div_post)

            # find the divs containing the rest of the articles
            div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
            if div_other:
                for div_entry in div_other.findAll('div','entry'):
                    h2_tag = div_entry.h2
                    handle_article(h2_tag,div_entry)

        for page_name,page_title in self.sectionlist:
            parse_index_page(page_name,page_title)
            ans.append(page_title)

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans
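A quick check of the decode_date() helper above, assuming Macleans bylines have the shape the recipe splits on ("Author Name - Sunday, January 24, 2010"); the month-list lookup and the comma/space splits only work for exactly that layout:

    from datetime import date

    def decode_date(datestr):
        dmysplit = datestr.strip().lower().split(',')  # ['sunday', ' january 24', ' 2010']
        mdsplit = dmysplit[1].split()                  # ['january', '24']
        m = ['january','february','march','april','may','june','july',
             'august','september','october','november','december'].index(mdsplit[0]) + 1
        d = int(mdsplit[1])
        y = int(dmysplit[2].split()[0])
        return date(y, m, d)

    assert decode_date('Sunday, January 24, 2010') == date(2010, 1, 24)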
resources/recipes/metro_montreal.recipe (new file, 24 lines)
@@ -0,0 +1,24 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Metro_Montreal(BasicNewsRecipe):

    title = u'M\xe9tro Montr\xe9al'
    __author__ = 'Jerry Clapperton'
    description = u'Le quotidien le plus branch\xe9 sur le monde'
    language = 'fr'

    oldest_article = 7
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    encoding = 'utf-8'

    extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'

    remove_tags = [dict(attrs={'id':'buttons'}), dict(name=['img', 'style'])]

    feeds = [(u"L'info", u'http://journalmetro.com/linfo/rss'), (u'Monde', u'http://journalmetro.com/monde/rss'), (u'Culture', u'http://journalmetro.com/culture/rss'), (u'Sports', u'http://journalmetro.com/sports/rss'), (u'Paroles', u'http://journalmetro.com/paroles/rss')]

    def print_version(self, url):
        return url.replace('article', 'ArticlePrint') + '?language=fr'
resources/recipes/michellemalkin.recipe (new file, 49 lines)
@@ -0,0 +1,49 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
'''
www.michellemalkin.com
'''
from calibre.web.feeds.news import BasicNewsRecipe

class MichelleMalkin(BasicNewsRecipe):
    title = u'Michelle Malkin'
    description = "Michelle Malkin's take on events, a mother, wife, blogger, conservative syndicated columnist, author, and Fox News Channel contributor."
    __author__ = 'Walt Anthony'
    publisher = 'Michelle Malkin LLC'
    category = 'news, politics, USA'
    oldest_article = 7 #days
    max_articles_per_feed = 50
    summary_length = 150
    language = 'en'

    remove_javascript = True
    no_stylesheets = True

    conversion_options = {
        'comment'          : description
      , 'tags'             : category
      , 'publisher'        : publisher
      , 'language'         : language
      , 'linearize_tables' : True
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'article'})
    ]

    remove_tags = [
        dict(name=['iframe', 'embed', 'object']),
        dict(name='div', attrs={'id':['comments', 'commentForm']}),
        dict(name='div', attrs={'class':['postCategories', 'comments', 'blogInfo', 'postInfo']})
    ]

    feeds = [(u'http://feeds.feedburner.com/michellemalkin/posts')]

    def print_version(self, url):
        return url + '?print=1'
resources/recipes/nationalreviewonline.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
'''
www.nationalreview.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class NRO(BasicNewsRecipe):
    title = u'National Review Online'
    __author__ = 'Walt Anthony'
    description = "National Review is America's most widely read and influential magazine and web site for Republican/conservative news, commentary, and opinion."
    publisher = 'National Review, Inc.'
    category = 'news, politics, USA'
    oldest_article = 3
    max_articles_per_feed = 100
    summary_length = 150
    language = 'en'
    encoding = 'utf-8'
    use_embedded_content = True
    remove_javascript = True

    conversion_options = {
        'comment'   : description
      , 'tags'      : category
      , 'publisher' : publisher
      , 'language'  : language
    }

    remove_tags = [
        dict(name=['embed','object','iframe']),
    ]

    feeds = [
        (u'National Review', u'http://www.nationalreview.com/index.xml'),
        (u'The Corner', u'http://corner.nationalreview.com/corner.xml'),
        (u'The Agenda', u'http://agenda.nationalreview.com/agenda.xml'),
        (u'Bench Memos', u'http://bench.nationalreview.com/bench.xml'),
        (u'Campaign Spot', u'http://campaignspot.nationalreview.com/campaignspot.xml'),
        (u'Critical Care', u'http://healthcare.nationalreview.com/healthcare.xml'),
        (u'Doctor, Doctor', u'http://www.nationalreview.com/doctor/doctor.xml'),
        (u"Kudlow's Money Politic$", u'http://kudlow.nationalreview.com/kudlow.xml'),
        (u'Media Blog', u'http://media.nationalreview.com/media.xml'),
        (u'Phi Beta Cons', u'http://phibetacons.nationalreview.com/phibetacons.xml'),
        (u'Planet Gore', u'http://planetgore.nationalreview.com/planetgore.xml')
    ]
resources/recipes/neowin.recipe (new file, 40 lines)
@@ -0,0 +1,40 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Neowin(BasicNewsRecipe):
    title = u'Neowin.net'
    oldest_article = 5
    language = 'en'
    description = 'News from IT'
    publisher = 'Neowin'
    category = 'news, IT, Microsoft, Apple, hardware, software, games'
    __author__ = 'Darko Miletic'
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf8'

    conversion_options = {
        'tags'      : category
       ,'language'  : language
       ,'comments'  : description
       ,'publisher' : publisher
    }

    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
    remove_tags_after = dict(name='div', attrs={'id':'tag-bar'})

    remove_tags = [
        dict(name=['base','object','link','iframe'])
       ,dict(name='div', attrs={'id':'tag-bar'})
    ]

    feeds = [
        (u'Software' , u'http://www.neowin.net/news/rss/software' )
       ,(u'Gaming'   , u'http://www.neowin.net/news/rss/gaming'   )
       ,(u'Microsoft', u'http://www.neowin.net/news/rss/microsoft')
       ,(u'Apple'    , u'http://www.neowin.net/news/rss/apple'    )
       ,(u'Editorial', u'http://www.neowin.net/news/rss/editorial')
    ]

    def image_url_processor(cls, baseurl, url):
        return url
@@ -1,46 +1,42 @@
-#!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
-nin.co.rs
+www.nin.co.rs
 '''

 import re, urllib
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class Nin(BasicNewsRecipe):
     title = 'NIN online'
     __author__ = 'Darko Miletic'
-    description = 'Nedeljne informativne novine'
-    publisher = 'NIN D.O.O.'
+    description = 'Nedeljne Informativne Novine'
+    publisher = 'NIN d.o.o.'
     category = 'news, politics, Serbia'
     no_stylesheets = True
     oldest_article = 15
-    simultaneous_downloads = 1
-    delay = 1
     encoding = 'utf-8'
     needs_subscription = True
+    remove_empty_feeds = True
     PREFIX = 'http://www.nin.co.rs'
     INDEX = PREFIX + '/?change_lang=ls'
     LOGIN = PREFIX + '/?logout=true'
     use_embedded_content = False
     language = 'sr'
-    lang = 'sr-Latn-RS'
-    direction = 'ltr'
-    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold}'
+    extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana, Lucida, sans1, sans-serif} .article_description{font-family: Verdana, Lucida, sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold; color: #900} .izjava{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold;} img{margin-top:0.5em; margin-bottom: 0.7em} b{margin-top: 1em} '

     conversion_options = {
         'comment' : description
       , 'tags' : category
       , 'publisher' : publisher
       , 'language' : language
-      , 'pretty_print' : True
+      , 'linearize_tables' : True
     }

     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+    remove_attributes = ['height','width']

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()

@@ -65,35 +61,20 @@ class Nin(BasicNewsRecipe):
         cover_url = self.PREFIX + link_item['src']
         return cover_url

-    def preprocess_html(self, soup):
-        soup.html['lang'] = self.lang
-        soup.html['dir' ] = self.direction
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mlang)
-        soup.head.insert(1,mcharset)
-        attribs = [ 'style','font','valign'
-                   ,'colspan','width','height'
-                   ,'rowspan','summary','align'
-                   ,'cellspacing','cellpadding'
-                   ,'frames','rules','border'
-                  ]
-        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
-            item.name = 'div'
-            for attrib in attribs:
-                if item.has_key(attrib):
-                    del item[attrib]
-        return soup
-
     def parse_index(self):
         articles = []
+        count = 0
         soup = self.index_to_soup(self.PREFIX)
         for item in soup.findAll('a',attrs={'class':'lmeninavFont'}):
+            count = count +1
+            if self.test and count > 2:
+                return articles
             section = self.tag_to_string(item)
             feedlink = self.PREFIX + item['href']
             feedpage = self.index_to_soup(feedlink)
             self.report_progress(0, _('Fetching feed')+' %s...'%(section))
             inarts = []
+            count2 = 0
             for art in feedpage.findAll('span',attrs={'class':'artTitle'}):
                 alink = art.parent
                 url = self.PREFIX + alink['href']

@@ -110,3 +91,4 @@ class Nin(BasicNewsRecipe):
                 })
             articles.append((section,inarts))
         return articles
+
resources/recipes/nursingtimes.recipe (new file, 67 lines)
@@ -0,0 +1,67 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nursingtimes.net
'''

import urllib
from calibre.web.feeds.recipes import BasicNewsRecipe

class NursingTimes(BasicNewsRecipe):
    title = 'Nursing Times'
    __author__ = 'Darko Miletic'
    description = 'Nursing practice, NHS and health care news'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'emap'
    category = 'news, health, nursing, UK'
    language = 'en-UK'
    needs_subscription = True
    LOGIN = 'http://www.nursingtimes.net/sign-in'

    conversion_options = {
        'comments'  : description
       ,'tags'      : category
       ,'language'  : language
       ,'publisher' : publisher
    }

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open(self.LOGIN)
        if self.username is not None and self.password is not None:
            data = urllib.urlencode({ 'campaigncode'  :'0'
                                     ,'referrer'      :''
                                     ,'security_text' :''
                                     ,'SIemail'       :self.username
                                     ,'passWord'      :self.password
                                     ,'LoginButton.x' :'27'
                                     ,'LoginButton.y' :'13'
                                    })
            br.open(self.LOGIN,data)
        return br

    keep_only_tags = [dict(name='div', attrs={'class':'storytext'})]
    remove_tags = [
        dict(name=['object','link','script','iframe'])
       ,dict(name='div',attrs={'id':'comments_form'})
    ]
    remove_tags_after = dict(name='div',attrs={'id':'comments_form'})

    feeds = [
        (u'Breaking News', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=1')
       ,(u'Practice', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=512')
       ,(u'Behind the headlines', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=468')
       ,(u'Analysis', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=62')
       ,(u'Acute care news', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=5')
       ,(u'Primary care news', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=231')
       ,(u'Mental Health news', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=27')
       ,(u'Management news', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=32')
       ,(u"Older people's nursing news", u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=181')
       ,(u'Respiratory news', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=177')
       ,(u'Wound care news', u'http://www.nursingtimes.net/XmlServers/navsectionRSS.aspx?navsectioncode=182')
    ]
@@ -79,13 +79,30 @@ class NYTimes(BasicNewsRecipe):
                     .authorId {text-align: left; \
                     font-style: italic;}\n '

-#    def get_cover_url(self):
-#        st = time.localtime()
-#        year = str(st.tm_year)
-#        month = "%.2d" % st.tm_mon
-#        day = "%.2d" % st.tm_mday
-#        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
-#        return cover
+    def get_cover_url(self):
+        cover = None
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def get_masthead_url(self):
+        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)
+        except:
+            self.log("\nCover unavailable")
+            masthead = None
+        return masthead

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
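The new get_cover_url() above builds a dated front-page scan URL and probes it with br.open() before returning it, falling back to None (no cover) when the request raises. A self-contained sketch of the same probe-before-use pattern; it uses urllib2 in place of calibre's browser, an assumption made only to keep the example standalone:

    import time, urllib2

    def todays_nyt_cover():
        st = time.localtime()
        # Same URL shape the recipe assembles from year/month/day pieces.
        url = 'http://graphics8.nytimes.com/images/%d/%02d/%02d/nytfrontpage/scan.jpg' % (
            st.tm_year, st.tm_mon, st.tm_mday)
        try:
            urllib2.urlopen(url).close()  # probe only; raises if the scan is not up yet
        except Exception:
            return None
        return url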
@@ -5,16 +5,23 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import string, re
+import string, re, time
 from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup

+def decode(self, src):
+    enc = 'utf-8'
+    if 'iso-8859-1' in src:
+        enc = 'cp1252'
+    return src.decode(enc, 'ignore')

 class NYTimes(BasicNewsRecipe):

     title = 'The New York Times (subscription)'
     __author__ = 'Kovid Goyal'
     language = 'en'
+    requires_version = (0, 6, 36)

     description = 'Daily news from the New York Times (subscription version)'
     timefmt = ' [%a, %b %d, %Y]'

@@ -24,10 +31,11 @@ class NYTimes(BasicNewsRecipe):
     remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                 dict(id=['footer', 'toolsRight', 'articleInline',
                     'navigation', 'archive', 'side_search', 'blog_sidebar',
-                    'side_tool', 'side_index',
+                    'side_tool', 'side_index', 'login', 'businessSearchBar',
+                    'adxLeaderboard',
                     'relatedArticles', 'relatedTopics', 'adxSponLink']),
                 dict(name=['script', 'noscript', 'style'])]
-    #encoding = 'cp1252'
+    encoding = decode
     no_stylesheets = True
     extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

@@ -44,13 +52,39 @@ class NYTimes(BasicNewsRecipe):
         #open('/t/log.html', 'wb').write(raw)
         return br

+    def get_masthead_url(self):
+        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)
+        except:
+            self.log("\nCover unavailable")
+            masthead = None
+        return masthead
+
+    def get_cover_url(self):
+        cover = None
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
     def short_title(self):
         return 'NY Times'

     def parse_index(self):
         self.encoding = 'cp1252'
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
-        self.encoding = None
+        self.encoding = decode

         def feed_title(div):
             return ''.join(div.findAll(text=True, recursive=False)).strip()
@@ -1,31 +1,40 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class NewsandObserver(BasicNewsRecipe):
-    title = u'News and Observer'
+    title = u'Raleigh News & Observer'
     description = 'News from Raleigh, North Carolina'
     language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 5 #days
+    __author__ = 'Krittika Goyal updated by Walt Anthony'
+    oldest_article = 3 #days
     max_articles_per_feed = 25
+    summary_length = 150
+
+    no_stylesheets = True
+    remove_javascript = True

-    remove_stylesheets = True
     remove_tags_before = dict(name='h1', attrs={'id':'story_headline'})
     remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'})

     remove_tags = [
         dict(name='iframe'),
-        dict(name='div', attrs={'id':['right-rail', 'story_tools']}),
+        dict(name='div', attrs={'id':['right-rail', 'story_tools', 'toolbox', 'toolbar', 'tool', 'shirttail', 'comment_widget', 'story_keywords', 'txtResizeTool']}),
+        dict(name='div', attrs={'class':['Buy-It-Now', 'story_link_share']}),
         dict(name='ul', attrs={'class':'bold_tabs_nav'}),
     ]

     feeds = [
         ('Cover', 'http://www.newsobserver.com/100/index.rss'),
         ('News', 'http://www.newsobserver.com/102/index.rss'),
         ('Politics', 'http://www.newsobserver.com/105/index.rss'),
         ('Business', 'http://www.newsobserver.com/104/index.rss'),
         ('Sports', 'http://www.newsobserver.com/103/index.rss'),
         ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
         ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
-        ('Editorials', 'http://www.newsobserver.com/158/index.rss')]
+        ('Editorials', 'http://www.newsobserver.com/158/index.rss')
+    ]
resources/recipes/oc_register.recipe (new file, 73 lines)
@@ -0,0 +1,73 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'News from the Orange county - v1.01 (29, January 2010)'

'''
http://www.ocregister.com/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ocRegister(BasicNewsRecipe):
    author = 'Lorenzo Vigentini'
    description = 'News from the Orange county'

    cover_url = 'http://images.onset.freedom.com/ocregister/logo.gif'
    title = u'Orange County Register'
    publisher = 'Orange County Register Communication'
    category = 'News, finance, economy, politics'

    language = 'en'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 1
    max_articles_per_feed = 25
    use_embedded_content = False
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    def print_version(self,url):
        printUrl = 'http://www.ocregister.com/common/printer/view.php?db=ocregister&id='
        segments = url.split('/')
        subSegments = (segments[4]).split('.')
        myArticle = (subSegments[0]).replace('-', '')
        myURL= printUrl + myArticle
        return myURL

    keep_only_tags = [
        dict(name='div', attrs={'id':'ArticleContentWrap'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':'hideForPrint'}),
        dict(name='div', attrs={'id':'ContentFooter'})
    ]

    feeds = [
        (u'News', u'http://www.ocregister.com/common/rss/rss.php?catID=18800'),
        (u'Today paper', u'http://www.ocregister.com/common/rss/rss.php?catID=18976'),
        (u'Business', u'http://www.ocregister.com/common/rss/rss.php?catID=18909'),
        (u'Cars', u'http://www.ocregister.com/common/rss/rss.php?catID=20128'),
        (u'Entertainment', u'http://www.ocregister.com/common/rss/rss.php?catID=18926'),
        (u'Home', u'http://www.ocregister.com/common/rss/rss.php?catID=19142'),
        (u'Life', u'http://www.ocregister.com/common/rss/rss.php?catID=18936'),
        (u'Opinion', u'http://www.ocregister.com/common/rss/rss.php?catID=18963'),
        (u'Sports', u'http://www.ocregister.com/common/rss/rss.php?catID=18901'),
        (u'Travel', u'http://www.ocregister.com/common/rss/rss.php?catID=18959')
    ]

    extra_css = '''
        h1 {color:#ff6600;font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:20px;}
        h2 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
        h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:15px;}
        h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:13px; }
        h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:11px; text-transform:uppercase;}
        #articledate {color:#333333;font-family:Arial,Helvetica,sans-serif;font-size:10px; font-size-adjust:none; font-stretch:normal; font-style:italic; font-variant:normal; font-weight:bold; line-height:10px; text-decoration:none;}
        #articlebyline {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif;font-size:10px; font-size-adjust:none; font-stretch:normal; font-style:bold; font-variant:normal; font-weight:bold; line-height:10px; text-decoration:none;}
        img {align:left;}
        #topstoryhead {color:#ff6600;font-family:Arial,Helvetica,sans-serif; font-size:22px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:20px;}
        '''
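The print_version() above keys off the fifth slash-separated segment of the article URL, keeps the part before the first dot, and strips hyphens to recover the article id for the printer view. A worked trace under an assumed URL shape; the hyphen-padded numeric segment is an inference from the replace('-', '') step, not something the diff states:

    url = 'http://www.ocregister.com/articles/-235703--.html'
    segments = url.split('/')
    # ['http:', '', 'www.ocregister.com', 'articles', '-235703--.html']
    subSegments = segments[4].split('.')         # ['-235703--', 'html']
    myArticle = subSegments[0].replace('-', '')  # '235703'
    print('http://www.ocregister.com/common/printer/view.php?db=ocregister&id=' + myArticle)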
resources/recipes/open_left.recipe (new file, 22 lines)
@@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class OpenLeft(BasicNewsRecipe):
    # Information about the recipe

    title = 'Open Left'
    description = 'Progressive American commentary on current events'
    category = 'news, commentary'
    language = 'en'
    __author__ = 'Xanthan Gum'

    # Fetch no article older than seven days

    oldest_article = 7

    # Fetch no more than 100 articles

    max_articles_per_feed = 100

    # Fetch the articles from the RSS feed

    feeds = [(u'Articles', u'http://www.openleft.com/rss/rss2.xml')]
@@ -1,10 +1,12 @@
 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 pagina12.com.ar
 '''

+import time
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class Pagina12(BasicNewsRecipe):

@@ -19,6 +21,8 @@ class Pagina12(BasicNewsRecipe):
     encoding = 'cp1252'
     use_embedded_content = False
     language = 'es'
+    remove_empty_feeds = True
+    extra_css = ' body{font-family: sans-serif} '

     conversion_options = {
         'comment' : description

@@ -47,3 +51,8 @@ class Pagina12(BasicNewsRecipe):
     def print_version(self, url):
         return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')
+
+    def get_cover_url(self):
+        imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg']
+        weekday = time.localtime().tm_wday
+        return strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/') + imgnames[weekday]
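The added get_cover_url() picks the cover image name by weekday: tm_wday runs 0 (Monday) through 6 (Sunday), so Mondays map to 'tapan.jpg', Tuesdays to 'tapagn.jpg', and so on, appended to a date-formatted path. A quick trace pinned to a known Monday, using time.strftime rather than calibre's strftime (an assumption made to keep the snippet self-contained):

    import time

    imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg']
    st = time.strptime('2010-01-25', '%Y-%m-%d')  # a Monday, so st.tm_wday == 0
    url = time.strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/', st) + imgnames[st.tm_wday]
    # -> 'http://www.pagina12.com.ar/fotos/20100125/diario/tapan.jpg'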
resources/recipes/politiken_dk.recipe (new file, 55 lines)
@@ -0,0 +1,55 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
politiken.dk
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Politiken_dk(BasicNewsRecipe):
    title = 'Politiken.dk'
    __author__ = 'Darko Miletic'
    description = 'News from Denmark'
    publisher = 'politiken.dk'
    category = 'news, politics, Denmark'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    encoding = 'cp1252'
    language = 'da'

    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1{font-family: Georgia,"Times New Roman",Times,serif } '

    conversion_options = {
        'comment'   : description
      , 'tags'      : category
      , 'publisher' : publisher
      , 'language'  : language
    }

    feeds = [
        (u'Tophistorier'   , u'http://politiken.dk/rss/tophistorier.rss')
       ,(u'Seneste nyt'    , u'http://politiken.dk/rss/senestenyt.rss')
       ,(u'Mest laeste'    , u'http://politiken.dk/rss/mestlaeste.rss')
       ,(u'Danmark'        , u'http://politiken.dk/rss/indland.rss')
       ,(u'Politik'        , u'http://politiken.dk/rss/politik.rss')
       ,(u'Klima'          , u'http://politiken.dk/rss/klima.rss')
       ,(u'Internationalt' , u'http://politiken.dk/rss/udland.rss')
       ,(u'Erhverv'        , u'http://politiken.dk/rss/erhverv.rss')
       ,(u'Kultur'         , u'http://politiken.dk/rss/kultur.rss')
       ,(u'Sport'          , u'http://politiken.dk/rss/sport.rss')
       ,(u'Uddannelse'     , u'http://politiken.dk/rss/uddannelse.rss')
       ,(u'Videnskab'      , u'http://politiken.dk/rss/videnskab.rss')
    ]
    remove_tags_before = dict(name='h1')
    remove_tags = [
        dict(name=['object','link'])
       ,dict(name='div',attrs={'class':'footer'})
    ]

    def print_version(self, url):
        return url + '?service=print'
resources/recipes/slovo.recipe (new file, 41 lines)
@@ -0,0 +1,41 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class SlovoRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'Abelturd'
    language = 'sk'
    version = 1

    title = u'SLOVO'
    publisher = u''
    category = u'News, Newspaper'
    description = u'Politicko-spolo\u010densk\xfd t\xfd\u017edenn\xedk'
    encoding = 'Windows-1250'

    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True

    no_stylesheets = True
    remove_javascript = True

    feeds = []
    feeds.append((u'V\u0161etky \u010dl\xe1nky', u'http://www.noveslovo.sk/rss.asp'))

    keep_only_tags = []
    remove_tags = []

    preprocess_regexps = [
        (re.compile(r'<img src="gif/image1.gif">', re.DOTALL|re.IGNORECASE),
         lambda match: ''),
    ]

    def print_version(self, url):
        m = re.search('(?<=id=)[0-9]*', url)

        return u'http://www.noveslovo.sk/clanoktlac.asp?id=' + str(m.group(0))
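The print_version() above uses a lookbehind regex to pull the numeric id out of the article URL and rebuilds the print URL from it. A worked example with a hypothetical article id; the ?id= query shape is inferred from the (?<=id=) pattern:

    import re

    url = 'http://www.noveslovo.sk/clanok.asp?id=12345'
    m = re.search('(?<=id=)[0-9]*', url)  # the digits immediately after 'id='
    print(u'http://www.noveslovo.sk/clanoktlac.asp?id=' + str(m.group(0)))
    # -> http://www.noveslovo.sk/clanoktlac.asp?id=12345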
@@ -1,6 +1,5 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-#from random import randint
 from urllib import quote

 class SportsIllustratedRecipe(BasicNewsRecipe) :

@@ -9,12 +8,11 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
     __license__ = 'GPL v3'
     language = 'en'
     description = 'Sports Illustrated'
-    version = 1
+    version = 3
     title = u'Sports Illustrated'

     no_stylesheets = True
     remove_javascript = True
-    #template_css = ''
     use_embedded_content = False

     INDEX = 'http://sportsillustrated.cnn.com/'

@@ -22,13 +20,39 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
     def parse_index(self):
         answer = []
         soup = self.index_to_soup(self.INDEX)
-        # Find the link to the current issue on the front page.
+        # Find the link to the current issue on the front page. SI Cover
         cover = soup.find('img', attrs = {'alt' : 'Read All Articles', 'style' : 'vertical-align:bottom;'})
         if cover:
             currentIssue = cover.parent['href']
             if currentIssue:
                 # Open the index of current issue
+
                 index = self.index_to_soup(currentIssue)
+                self.log('\tLooking for current issue in: ' + currentIssue)
+
+                # Now let us see if they updated their frontpage
+                nav = index.find('div', attrs = {'class': 'siv_trav_top'})
+                if nav:
+                    img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_next_v2.jpg'})
+                    if img:
+                        parent = img.parent
+                        if parent.name == 'a':
+                            # They didn't update their frontpage; Load the next issue from here
+                            href = self.INDEX + parent['href']
+                            index = self.index_to_soup(href)
+                            self.log('\tLooking for current issue in: ' + href)
+
+                if index.find('div', 'siv_noArticleMessage'):
+                    nav = index.find('div', attrs = {'class': 'siv_trav_top'})
+                    if nav:
+                        # Their frontpage points to an issue without any articles; Use the previous issue
+                        img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_previous_v2.jpg'})
+                        if img:
+                            parent = img.parent
+                            if parent.name == 'a':
+                                href = self.INDEX + parent['href']
+                                index = self.index_to_soup(href)
+                                self.log('\tLooking for current issue in: ' + href)

                 # Find all articles.
                 list = index.find('div', attrs = {'class' : 'siv_artList'})

@@ -69,31 +93,26 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :

     def preprocess_html(self, soup):
         header = soup.find('div', attrs = {'class' : 'siv_artheader'})
-        if header:
-            # It's an article, prepare a container for the content
-            homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
-            body = homeMadeSoup.find('body')
+        homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
+        body = homeMadeSoup.body

         # Find the date, title and byline
         temp = header.find('td', attrs = {'class' : 'title'})
         if temp :
             date = temp.find('div', attrs = {'class' : 'date'})
             if date:
                 body.append(date)
             if temp.h1:
                 body.append(temp.h1)
             if temp.h2 :
                 body.append(temp.h2)
             byline = temp.find('div', attrs = {'class' : 'byline'})
             if byline:
                 body.append(byline)

         # Find the content
         for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) :
             body.append(para)

         return homeMadeSoup
-        else :
-            # It's a TOC, just return the whole lot
-            return soup
resources/recipes/the_gazette.recipe (new file, 22 lines)
@@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class The_Gazette(BasicNewsRecipe):

    cover_url = 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg'
    title = u'The Gazette'
    __author__ = 'Jerry Clapperton'
    description = 'Montreal news in English'
    language = 'en_CA'

    oldest_article = 7
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    encoding = 'utf-8'

    keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})]

    extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'

    feeds = [(u'News', u'http://feeds.canada.com/canwest/F297'), (u'Opinion', u'http://feeds.canada.com/canwest/F7383'), (u'Arts', u'http://feeds.canada.com/canwest/F7366'), (u'Life', u'http://rss.canada.com/get/?F6934'), (u'Business', u'http://feeds.canada.com/canwest/F6939'), (u'Travel', u'http://rss.canada.com/get/?F6938'), (u'Health', u'http://feeds.canada.com/canwest/F7397'), (u'Technology', u'http://feeds.canada.com/canwest/F7411')]
resources/recipes/the_week_magazine_free.recipe (new file, 49 lines)
@@ -0,0 +1,49 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.theweek.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheWeekFree(BasicNewsRecipe):
    title = 'The Week Magazine - Free content'
    __author__ = 'Darko Miletic'
    description = "The best of the US and international media. Daily coverage of commentary and analysis of the day's events, as well as arts, entertainment, people and gossip, and political cartoons."
    publisher = 'The Week Publications, Inc.'
    category = 'news, politics, USA'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en'

    conversion_options = {
        'comment'   : description
      , 'tags'      : category
      , 'publisher' : publisher
      , 'language'  : language
    }

    keep_only_tags = [
        dict(name=['h1','h2'])
      , dict(name='div', attrs={'class':'basefont'})
      , dict(name='div', attrs={'id':'slideshowLoader'})
    ]

    remove_tags = [
        dict(name='div', attrs={'id':['digg_dugg','articleRight','dateHeader']})
       ,dict(name=['object','embed','iframe'])
    ]

    feeds = [
        (u'News & Opinions'       , u'http://www.theweek.com/section/index/news_opinion.rss')
       ,(u'Arts & Leisure'        , u'http://www.theweek.com/section/index/arts_leisure.rss')
       ,(u'Business'              , u'http://www.theweek.com/section/index/business.rss' )
       ,(u'Cartoon & Short takes' , u'http://www.theweek.com/section/index/cartoons_wit.rss')
    ]
@@ -46,3 +46,10 @@ class WashingtonPost(BasicNewsRecipe):
             div['style'] = ''
         return soup

+    def preprocess_html(self, soup):
+        for tag in soup.findAll('font'):
+            if tag.has_key('size'):
+                if tag['size'] == '+2':
+                    if tag.b:
+                        return soup
+        return None
@@ -1,44 +1,105 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__docformat__ = 'restructuredtext en'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.wired.com
+'''
+
+import re
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class Wired(BasicNewsRecipe):
-    title = 'Wired.com'
-    __author__ = 'Kovid Goyal'
-    description = 'Technology news'
-    timefmt = ' [%Y%b%d  %H%M]'
-    language = 'en'
-
-    no_stylesheets = True
-
-    remove_tags_before = dict(name='div', id='content')
-    remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
-        'footer', 'advertisement', 'blog_subscription_unit',
-        'brightcove_component']),
-        {'class':'entryActions'},
-        dict(name=['noscript', 'script'])]
-
-    feeds = [
-        ('Top News', 'http://feeds.wired.com/wired/index'),
-        ('Culture', 'http://feeds.wired.com/wired/culture'),
-        ('Software', 'http://feeds.wired.com/wired/software'),
-        ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
-        ('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
-        ('Cars', 'http://feeds.wired.com/wired/cars'),
-        ('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
-        ('Gaming', 'http://feeds.wired.com/wired/gaming'),
-        ('Science', 'http://feeds.wired.com/wired/science'),
-        ('Med Tech', 'http://feeds.wired.com/wired/medtech'),
-        ('Politics', 'http://feeds.wired.com/wired/politics'),
-        ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
-        ('Commentary', 'http://feeds.wired.com/wired/commentary'),
-    ]
+    title                 = 'Wired Magazine'
+    __author__            = 'Darko Miletic'
+    description           = 'Gaming news'
+    publisher             = 'Conde Nast Digital'
+    category              = 'news, games, IT, gadgets'
+    oldest_article        = 32
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    encoding              = 'utf-8'
+    use_embedded_content  = False
+    language              = 'en'
+    extra_css             = ' body{font-family: sans-serif} .entryDescription li {display: inline; list-style-type: none} '
+    index                 = 'http://www.wired.com/magazine/'
+
+    preprocess_regexps = [(re.compile(r'<meta name="Title".*<title>', re.DOTALL|re.IGNORECASE),lambda match: '<title>')]
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
+    remove_tags_after = dict(name='div', attrs={'class':'tweetmeme_button'})
+    remove_tags = [
+                     dict(name=['object','embed','iframe','link'])
+                    ,dict(name='div', attrs={'class':['podcast_storyboard','tweetmeme_button']})
+                  ]
+
+    #feeds = [(u'Articles' , u'http://www.wired.com/magazine/feed/' )]
+
+    def parse_index(self):
+        totalfeeds = []
+
+        soup = self.index_to_soup(self.index)
+        features = soup.find('div',attrs={'id':'my-glider'})
+        if features:
+            farticles = []
+            for item in features.findAll('div',attrs={'class':'section'}):
+                divurl = item.find('div',attrs={'class':'feature-header'})
+                divdesc = item.find('div',attrs={'class':'feature-text'})
+                url = 'http://www.wired.com' + divurl.a['href']
+                title = self.tag_to_string(divurl.a)
+                description = self.tag_to_string(divdesc)
+                date = strftime(self.timefmt)
+                farticles.append({
+                                   'title'      :title
+                                  ,'date'       :date
+                                  ,'url'        :url
+                                  ,'description':description
+                                 })
+            totalfeeds.append(('Featured Articles', farticles))
+
+        #department feeds
+        departments = ['rants','start','test','play','found']
+        dept = soup.find('div',attrs={'id':'magazine-departments'})
+        if dept:
+            for ditem in departments:
+                darticles = []
+                department = dept.find('div',attrs={'id':'department-'+ditem})
+                if department:
+                    for item in department.findAll('div'):
+                        description = ''
+                        feed_link = item.find('a')
+                        if feed_link and feed_link.has_key('href'):
+                            url = feed_link['href']
+                            title = self.tag_to_string(feed_link)
+                            date = strftime(self.timefmt)
+                            darticles.append({
+                                               'title'      :title
+                                              ,'date'       :date
+                                              ,'url'        :url
+                                              ,'description':description
+                                             })
+                totalfeeds.append((ditem.capitalize(), darticles))
+        return totalfeeds
+
+    def get_cover_url(self):
+        cover_url = None
+        soup = self.index_to_soup(self.index)
+        cover_item = soup.find('div',attrs={'class':'spread-image'})
+        if cover_item:
+            cover_url = 'http://www.wired.com' + cover_item.a.img['src']
+        return cover_url

     def print_version(self, url):
-        return url.replace('http://www.wired.com/', 'http://www.wired.com/print/')
+        return url.rstrip('/') + '/all/1'
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
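A note on the new recipe's parse_index: calibre expects it to return a list of (section title, article list) tuples, with each article a dict carrying title, url, date and description keys, exactly as built above. A minimal sketch of the expected shape (hypothetical section and URL):

    def parse_index(self):
        # Returns: [(feed_title, [article_dict, ...]), ...]
        return [('Example Section', [{
                    'title'      : 'An article',
                    'url'        : 'http://www.wired.com/example',
                    'date'       : '',
                    'description': '',
               }])]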
@@ -8,7 +8,7 @@ online.wsj.com
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-from datetime import timedelta, datetime, date
+from datetime import timedelta, date

 class WSJ(BasicNewsRecipe):
     # formatting adapted from original recipe by Kovid Goyal and Sujata Raman

@@ -74,16 +74,33 @@ class WSJ(BasicNewsRecipe):
         br = BasicNewsRecipe.get_browser()
         return br

     def preprocess_html(self,soup):
+
+        def decode_us_date(datestr):
+            udate = datestr.strip().lower().split()
+            m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1
+            d = int(udate[1])
+            y = int(udate[2])
+            return date(y,m,d)
+
+        # check if article is paid content
+        if self.omit_paid_content:
+            divtags = soup.findAll('div','tooltip')
+            if divtags:
+                for divtag in divtags:
+                    if divtag.find(text="Subscriber Content"):
+                        return None
+
         # check if article is too old
         datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
         if datetag:
             dateline_string = self.tag_to_string(datetag,False)
             date_items = dateline_string.split(',')
             datestring = date_items[0]+date_items[1]
-            article_date = datetime.strptime(datestring.title(),"%B %d %Y")
+            article_date = decode_us_date(datestring)
             earliest_date = date.today() - timedelta(days=self.oldest_article)
-            if article_date.date() < earliest_date:
+            if article_date < earliest_date:
                 self.log("Skipping article dated %s" % datestring)
                 return None
             datetag.parent.extract()
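The decode_us_date helper replaces datetime.strptime with an explicit month lookup; %B matches month names in the current locale, which is presumably why the strptime call was dropped. Restated standalone:

    from datetime import date

    def decode_us_date(datestr):
        # 'January 21 2010' -> date(2010, 1, 21), regardless of locale
        month, day, year = datestr.strip().lower().split()
        months = ['january','february','march','april','may','june','july',
                  'august','september','october','november','december']
        return date(int(year), months.index(month) + 1, int(day))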
@@ -20,4 +20,20 @@ function setup_image_scaling_handlers() {
     });
 }

+function extract_svged_images() {
+    $("svg").each(function() {
+        var children = $(this).children("img");
+        if (children.length == 1) {
+            var img = $(children[0]);
+            var href = img.attr('xlink:href');
+            if (href != undefined) {
+                $(this).replaceWith('<div style="text-align:center; margin: 0; padding: 0"><img style="height: 98%" alt="SVG Image" src="' + href +'"></img></div>');
+            }
+        }
+    });
+}
+
+$(document).ready(function() {
+    //extract_svged_images();
+});
@@ -112,6 +112,9 @@ class LinuxFreeze(Command):

         includes += ['calibre.gui2.convert.'+x.split('/')[-1].rpartition('.')[0] for x in \
                 glob.glob('src/calibre/gui2/convert/*.py')]
+        includes += ['calibre.gui2.catalog.'+x.split('/')[-1].rpartition('.')[0] for x in \
+                glob.glob('src/calibre/gui2/catalog/*.py')]

         LOADER = '/tmp/loader.py'
         open(LOADER, 'wb').write('# This script is never actually used.\nimport sys')
@@ -266,6 +266,7 @@ class Py2App(object):
     def get_local_dependencies(self, path_to_lib):
         for x in self.get_dependencies(path_to_lib):
             for y in (SW+'/lib/', '/usr/local/lib/', SW+'/qt/lib/',
+                    '/opt/local/lib/',
                     '/Library/Frameworks/Python.framework/', SW+'/freetype/lib/'):
                 if x.startswith(y):
                     if y == '/Library/Frameworks/Python.framework/':

@@ -338,8 +339,8 @@ class Py2App(object):
         c = join(self.build_dir, 'Contents')
         for x in ('Frameworks', 'MacOS', 'Resources'):
             os.makedirs(join(c, x))
-        x = 'library.icns'
-        shutil.copyfile(join('icons', x), join(self.resources_dir, x))
+        for x in ('library.icns', 'book.icns'):
+            shutil.copyfile(join('icons', x), join(self.resources_dir, x))

     @flush
     def add_calibre_plugins(self):

@@ -355,8 +356,13 @@ class Py2App(object):

     @flush
     def create_plist(self):
+        from calibre.ebooks import BOOK_EXTENSIONS
         env = dict(**ENV)
         env['CALIBRE_LAUNCHED_FROM_BUNDLE']='1';
+        docs = [{'CFBundleTypeName':'E-book',
+                 'CFBundleTypeExtensions':list(BOOK_EXTENSIONS),
+                 'CFBundleTypeRole':'Viewer',
+                }]
+
         pl = dict(
                 CFBundleDevelopmentRegion='English',

@@ -367,10 +373,11 @@ class Py2App(object):
                 CFBundlePackageType='APPL',
                 CFBundleSignature='????',
                 CFBundleExecutable='calibre',
+                CFBundleDocumentTypes=docs,
                 LSMinimumSystemVersion='10.4.2',
                 LSRequiresNativeExecution=True,
                 NSAppleScriptEnabled=False,
-                NSHumanReadableCopyright='Copyright 2008, Kovid Goyal',
+                NSHumanReadableCopyright='Copyright 2010, Kovid Goyal',
                 CFBundleGetInfoString=('calibre, an E-book management '
                     'application. Visit http://calibre-ebook.com for details.'),
                 CFBundleIconFile='library.icns',

@@ -594,6 +601,7 @@ class Py2App(object):
             if x == 'Info.plist':
                 plist = plistlib.readPlist(join(self.contents_dir, x))
                 plist['LSUIElement'] = '1'
+                plist.pop('CFBundleDocumentTypes')
                 plistlib.writePlist(plist, join(cc_dir, x))
             else:
                 os.symlink(join('../..', x),
@@ -117,9 +117,12 @@ def prints(*args, **kwargs):
             try:
                 arg = arg.encode(enc)
             except UnicodeEncodeError:
-                if not safe_encode:
-                    raise
-                arg = repr(arg)
+                try:
+                    arg = arg.encode('utf-8')
+                except:
+                    if not safe_encode:
+                        raise
+                    arg = repr(arg)
         if not isinstance(arg, str):
             try:
                 arg = str(arg)

@@ -129,9 +132,12 @@ def prints(*args, **kwargs):
             try:
                 arg = arg.encode(enc)
             except UnicodeEncodeError:
-                if not safe_encode:
-                    raise
-                arg = repr(arg)
+                try:
+                    arg = arg.encode('utf-8')
+                except:
+                    if not safe_encode:
+                        raise
+                    arg = repr(arg)

     file.write(arg)
     if i != len(args)-1:
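Both hunks install the same three-step fallback: try the detected encoding, then UTF-8, and only then fall back to repr() (or re-raise when safe_encode is unset). As a standalone sketch of the pattern:

    def encode_arg(arg, enc, safe_encode=True):
        # Preferred encoding first, UTF-8 second, repr() as a last resort.
        try:
            return arg.encode(enc)
        except UnicodeEncodeError:
            try:
                return arg.encode('utf-8')
            except:
                if not safe_encode:
                    raise
                return repr(arg)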
@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__   = 'calibre'
-__version__   = '0.6.34'
+__version__   = '0.6.36'
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"

 import re
@@ -2,10 +2,11 @@ from __future__ import with_statement
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

-import sys
+import os, sys, zipfile

-from calibre.ptempfile import PersistentTemporaryFile
 from calibre.constants import numeric_version
+from calibre.ptempfile import PersistentTemporaryFile


 class Plugin(object):
     '''

@@ -231,6 +232,8 @@ class CatalogPlugin(Plugin):
     A plugin that implements a catalog generator.
     '''

+    resources_path = None
+
     #: Output file type for which this plugin should be run
     #: For example: 'epub' or 'xml'
     file_types = set([])

@@ -249,14 +252,25 @@ class CatalogPlugin(Plugin):

     cli_options = []

+
     def search_sort_db(self, db, opts):
+
+        '''
+        # Don't add Catalogs to the generated Catalogs
+        cat = _('Catalog')
         if opts.search_text:
-            db.search(opts.search_text)
+            opts.search_text += ' not tag:'+cat
+        else:
+            opts.search_text = 'not tag:'+cat
+        '''
+
+        db.search(opts.search_text)

         if opts.sort_by:
             # 2nd arg = ascending
             db.sort(opts.sort_by, True)
-        return db.get_data_as_dict()
+        return db.get_data_as_dict(ids=opts.ids)

     def get_output_fields(self, opts):
         # Return a list of requested fields, with opts.sort_by first

@@ -272,11 +286,40 @@ class CatalogPlugin(Plugin):
             fields = list(all_fields & requested_fields)
         else:
             fields = list(all_fields)

         fields.sort()
-        fields.insert(0,fields.pop(int(fields.index(opts.sort_by))))
+        if opts.sort_by and opts.sort_by in fields:
+            fields.insert(0,fields.pop(int(fields.index(opts.sort_by))))
         return fields

-    def run(self, path_to_output, opts, db):
+    def initialize(self):
+        '''
+        If plugin is not a built-in, copy the plugin's .ui and .py files from
+        the zip file to $TMPDIR.
+        Tab will be dynamically generated and added to the Catalog Options dialog in
+        calibre.gui2.dialogs.catalog.py:Catalog
+        '''
+        from calibre.customize.builtins import plugins as builtin_plugins
+        from calibre.customize.ui import config
+        from calibre.ptempfile import PersistentTemporaryDirectory
+
+        if not type(self) in builtin_plugins and \
+           not self.name in config['disabled_plugins']:
+            files_to_copy = ["%s.%s" % (self.name.lower(),ext) for ext in ["ui","py"]]
+            resources = zipfile.ZipFile(self.plugin_path,'r')
+
+            if self.resources_path is None:
+                self.resources_path = PersistentTemporaryDirectory('_plugin_resources', prefix='')
+
+            for file in files_to_copy:
+                try:
+                    resources.extract(file, self.resources_path)
+                except:
+                    print " customize:__init__.initialize(): %s not found in %s" % (file, os.path.basename(self.plugin_path))
+                    continue
+            resources.close()
+
+    def run(self, path_to_output, opts, db, ids, notification=None):
         '''
         Run the plugin. Must be implemented in subclasses.
         It should generate the catalog in the format specified
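With the new signature, catalog plugins receive the ids of the books to catalog plus an optional progress callback. A minimal (hypothetical) subclass under that contract:

    class ExampleCatalog(CatalogPlugin):
        # Hypothetical plugin: writes one title per line to a text file.
        name       = 'Example catalog'
        file_types = set(['txt'])

        def run(self, path_to_output, opts, db, ids, notification=None):
            data = self.search_sort_db(db, opts)
            out = open(path_to_output, 'w')
            for i, record in enumerate(data):
                out.write('%s\n' % record['title'])
                if notification is not None:
                    notification(float(i + 1) / len(data), 'writing')
            out.close()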
|
@ -80,6 +80,17 @@ class PML2PMLZ(FileTypePlugin):
|
|||||||
return of.name
|
return of.name
|
||||||
|
|
||||||
|
|
||||||
|
# CHM MODIFIED
|
||||||
|
class CHMMetadataReader(MetadataReaderPlugin):
|
||||||
|
|
||||||
|
name = 'Read CHM metadata'
|
||||||
|
file_types = set(['chm'])
|
||||||
|
description = _('Read metadata from %s files') % 'CHM'
|
||||||
|
|
||||||
|
def get_metadata(self, stream, ftype):
|
||||||
|
from calibre.ebooks.metadata.chm import get_metadata
|
||||||
|
return get_metadata(stream)
|
||||||
|
|
||||||
class ComicMetadataReader(MetadataReaderPlugin):
|
class ComicMetadataReader(MetadataReaderPlugin):
|
||||||
|
|
||||||
name = 'Read comic metadata'
|
name = 'Read comic metadata'
|
||||||
@ -382,7 +393,7 @@ from calibre.ebooks.rtf.input import RTFInput
|
|||||||
from calibre.ebooks.tcr.input import TCRInput
|
from calibre.ebooks.tcr.input import TCRInput
|
||||||
from calibre.ebooks.txt.input import TXTInput
|
from calibre.ebooks.txt.input import TXTInput
|
||||||
from calibre.ebooks.lrf.input import LRFInput
|
from calibre.ebooks.lrf.input import LRFInput
|
||||||
from calibre.ebooks.chm.input import CHMInput # XXMODIFIED
|
from calibre.ebooks.chm.input import CHMInput # CHM MODIFIED
|
||||||
|
|
||||||
from calibre.ebooks.epub.output import EPUBOutput
|
from calibre.ebooks.epub.output import EPUBOutput
|
||||||
from calibre.ebooks.fb2.output import FB2Output
|
from calibre.ebooks.fb2.output import FB2Output
|
||||||
@ -405,7 +416,7 @@ from calibre.devices.hanlin.driver import HANLINV3, HANLINV5, BOOX
|
|||||||
from calibre.devices.blackberry.driver import BLACKBERRY
|
from calibre.devices.blackberry.driver import BLACKBERRY
|
||||||
from calibre.devices.cybook.driver import CYBOOK
|
from calibre.devices.cybook.driver import CYBOOK
|
||||||
from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
|
from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
|
||||||
POCKETBOOK360, GER2, ITALICA, ECLICTO, DBOOK
|
POCKETBOOK360, GER2, ITALICA, ECLICTO, DBOOK, INVESBOOK
|
||||||
from calibre.devices.iliad.driver import ILIAD
|
from calibre.devices.iliad.driver import ILIAD
|
||||||
from calibre.devices.irexdr.driver import IREXDR1000
|
from calibre.devices.irexdr.driver import IREXDR1000
|
||||||
from calibre.devices.jetbook.driver import JETBOOK
|
from calibre.devices.jetbook.driver import JETBOOK
|
||||||
@ -422,8 +433,8 @@ from calibre.devices.binatone.driver import README
|
|||||||
from calibre.devices.hanvon.driver import N516
|
from calibre.devices.hanvon.driver import N516
|
||||||
|
|
||||||
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
|
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
|
||||||
from calibre.library.catalog import CSV_XML
|
from calibre.library.catalog import CSV_XML, EPUB_MOBI
|
||||||
plugins = [HTML2ZIP, PML2PMLZ, GoogleBooks, ISBNDB, Amazon, CSV_XML]
|
plugins = [HTML2ZIP, PML2PMLZ, GoogleBooks, ISBNDB, Amazon, CSV_XML, EPUB_MOBI]
|
||||||
plugins += [
|
plugins += [
|
||||||
ComicInput,
|
ComicInput,
|
||||||
EPUBInput,
|
EPUBInput,
|
||||||
@ -441,7 +452,7 @@ plugins += [
|
|||||||
TCRInput,
|
TCRInput,
|
||||||
TXTInput,
|
TXTInput,
|
||||||
LRFInput,
|
LRFInput,
|
||||||
CHMInput, # XXMODIFIED
|
CHMInput, # CHM MODIFIED
|
||||||
]
|
]
|
||||||
plugins += [
|
plugins += [
|
||||||
EPUBOutput,
|
EPUBOutput,
|
||||||
@ -487,6 +498,7 @@ plugins += [
|
|||||||
ITALICA,
|
ITALICA,
|
||||||
ECLICTO,
|
ECLICTO,
|
||||||
DBOOK,
|
DBOOK,
|
||||||
|
INVESBOOK,
|
||||||
BOOX,
|
BOOX,
|
||||||
EB600,
|
EB600,
|
||||||
README,
|
README,
|
||||||
|
@@ -85,6 +85,9 @@ class OptionRecommendation(object):

 class DummyReporter(object):

+    def __init__(self):
+        self.cancel_requested = False
+
     def __call__(self, percent, msg=''):
         pass
@@ -9,23 +9,22 @@ from calibre.devices.usbms.driver import USBMS
 class BLACKBERRY(USBMS):

     name           = 'Blackberry Device Interface'
+    gui_name       = 'Blackberry'
     description    = _('Communicate with the Blackberry smart phone.')
     author         = _('Kovid Goyal')
-    supported_platforms = ['windows', 'linux']
+    supported_platforms = ['windows', 'linux', 'osx']

     # Ordered list of supported formats
     FORMATS     = ['mobi', 'prc']

     VENDOR_ID   = [0x0fca]
     PRODUCT_ID  = [0x8004, 0x0004]
-    BCD         = [0x0200, 0x0107]
+    BCD         = [0x0200, 0x0107, 0x0210, 0x0201]

     VENDOR_NAME = 'RIM'
     WINDOWS_MAIN_MEM = 'BLACKBERRY_SD'

-    #OSX_MAIN_MEM = 'Kindle Internal Storage Media'
-
     MAIN_MEMORY_VOLUME_LABEL  = 'Blackberry SD Card'

-    EBOOK_DIR_MAIN = 'ebooks'
+    EBOOK_DIR_MAIN = 'eBooks'
     SUPPORTS_SUB_DIRS = True
@@ -154,7 +154,7 @@ class ECLICTO(EB600):
     name = 'eClicto Device Interface'
     gui_name = 'eClicto'

-    FORMATS = ['epub', 'pdf', 'txt']
+    FORMATS = ['epub', 'pdf', 'htm', 'html', 'txt']

     VENDOR_NAME = 'ECLICTO'
     WINDOWS_MAIN_MEM = 'EBOOK'

@@ -173,3 +173,14 @@ class DBOOK(EB600):
     VENDOR_NAME      = 'INFINITY'
     WINDOWS_MAIN_MEM = 'AIRIS_DBOOK'
     WINDOWS_CARD_A_MEM = 'AIRIS_DBOOK'
+
+class INVESBOOK(EB600):
+
+    name = 'Inves Book Device Interface'
+    gui_name = 'Inves Book 600'
+
+    FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'html', 'pdf', 'rtf', 'txt']
+
+    VENDOR_NAME      = 'INVES_E6'
+    WINDOWS_MAIN_MEM = '00INVES_E600'
+    WINDOWS_CARD_A_MEM = '00INVES_E600'
@@ -23,7 +23,7 @@ class N516(USBMS):

     VENDOR_ID  = [0x0525]
     PRODUCT_ID = [0xa4a5]
-    BCD        = [0x323]
+    BCD        = [0x323, 0x326]

     VENDOR_NAME = 'INGENIC'
     WINDOWS_MAIN_MEM = '_FILE-STOR_GADGE'
@@ -71,7 +71,7 @@ int do_mount(const char *dev, const char *mp) {
 #ifdef __NetBSD__
     execlp("mount_msdos", "mount_msdos", "-u", uids, "-g", gids, "-o", options, dev, mp, NULL);
 #else
-    execlp("mount", "mount", "-t", "vfat", "-o", options, dev, mp, NULL);
+    execlp("mount", "mount", "-t", "auto", "-o", options, dev, mp, NULL);
 #endif
     errsv = errno;
     fprintf(stderr, "Failed to mount with error: %s\n", strerror(errsv));
@@ -86,4 +86,5 @@ class NOOK(USBMS):

         return drives

+    def sanitize_path_components(self, components):
+        return [x.replace('#', '_') for x in components]
@@ -274,7 +274,7 @@ class BookList(_BookList):
             node.setAttribute(attr, attrs[attr])
         try:
             w, h, data = mi.thumbnail
-        except TypeError:
+        except:
             w, h, data = None, None, None

         if data:
@@ -782,6 +782,13 @@ class Device(DeviceConfig, DevicePlugin):
         '''
         return default

+    def sanitize_path_components(self, components):
+        '''
+        Perform any device specific sanitization on the path components
+        for files to be uploaded to the device
+        '''
+        return components
+
     def create_upload_path(self, path, mdata, fname):
         path = os.path.abspath(path)
         extra_components = []

@@ -801,6 +808,8 @@ class Device(DeviceConfig, DevicePlugin):
             ext = os.path.splitext(fname)[1]

             from calibre.library.save_to_disk import get_components
+            if not isinstance(template, unicode):
+                template = template.decode('utf-8')
             extra_components = get_components(template, mdata, fname)
             if not extra_components:
                 extra_components.append(sanitize(self.filename_callback(fname,

@@ -834,6 +843,7 @@ class Device(DeviceConfig, DevicePlugin):

         extra_components = list(map(remove_trailing_periods, extra_components))
         components = shorten_components_to(250 - len(path), extra_components)
+        components = self.sanitize_path_components(components)
         filepath = os.path.join(path, *components)
         filedir = os.path.dirname(filepath)
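sanitize_path_components is a deliberate no-op hook on the base Device class; drivers with filesystem quirks override it, as the NOOK does above for '#'. Another hypothetical override in the same style:

    class SOMEDEVICE(USBMS):
        # Hypothetical driver whose firmware chokes on ampersands in paths.
        def sanitize_path_components(self, components):
            return [x.replace('&', 'and') for x in components]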
@@ -24,7 +24,7 @@ class DRMError(ValueError):

 BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
                    'html', 'xhtml', 'pdf', 'pdb', 'prc', 'mobi', 'azw', 'doc',
-                   'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'oebzip',
+                   'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
                    'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml']

 class HTMLRenderer(object):
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 Based on ideas from comiclrf created by FangornUK.
 '''

-import os, shutil, traceback, textwrap, time
+import os, shutil, traceback, textwrap, time, codecs
 from ctypes import byref
 from Queue import Empty

@@ -338,8 +338,12 @@ class ComicInput(InputFormatPlugin):
             if not os.path.exists('comics.txt'):
                 raise ValueError('%s is not a valid comic collection'
                         %stream.name)
-            for line in open('comics.txt',
-                    'rb').read().decode('utf-8').splitlines():
+            raw = open('comics.txt', 'rb').read().decode('utf-8')
+            raw = raw.lstrip(unicode(codecs.BOM_UTF8, "utf8"))
+            for line in raw.splitlines():
+                line = line.strip()
+                if not line:
+                    continue
                 fname, title = line.partition(':')[0], line.partition(':')[-1]
                 fname = os.path.join(tdir, *fname.split('/'))
                 if not title:
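The decoded UTF-8 BOM is the single character u'\ufeff', so the lstrip above removes at most that marker from the start of comics.txt; since lstrip returns a new string, the result must be reassigned. A quick check:

    import codecs
    bom = unicode(codecs.BOM_UTF8, 'utf8')   # u'\ufeff'
    raw = bom + u'one.cbz:One'
    assert raw.lstrip(bom) == u'one.cbz:One'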
@@ -268,7 +268,8 @@ class EPUBOutput(OutputFormatPlugin):
         # remove <img> tags with empty src elements
         bad = []
         for x in XPath('//h:img')(body):
-            if not x.get('src', '').strip():
+            src = x.get('src', '').strip()
+            if src in ('', '#') or src.startswith('http:'):
                 bad.append(x)
         for img in bad:
             img.getparent().remove(img)
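The widened predicate now also drops bare '#' placeholders and absolute http: URLs, which cannot be bundled into the EPUB. Restated standalone:

    def is_bad_img_src(src):
        # Empty, '#', or remote http: sources get the <img> removed.
        src = (src or '').strip()
        return src in ('', '#') or src.startswith('http:')

    assert is_bad_img_src('#') and is_bad_img_src('http://a/b.png')
    assert not is_bad_img_src('images/cover.jpg')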
@@ -132,7 +132,8 @@ class FB2MLizer(object):
             href = self.oeb_book.guide['titlepage'].href
             item = self.oeb_book.manifest.hrefs[href]
             if item.spine_position is None:
-                stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+                stylizer = Stylizer(item.data, item.href, self.oeb_book,
+                        self.opts, self.opts.output_profile)
                 output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
         return output

@@ -152,7 +153,7 @@ class FB2MLizer(object):
         text = []
         for item in self.oeb_book.spine:
             self.log.debug('Converting %s to FictionBook2 XML' % item.href)
-            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
             text.append(self.add_page_anchor(item))
             text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
         return ''.join(text)
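These two hunks are instances of a change that recurs through the LIT and MobiML hunks below: every Stylizer call site now passes the conversion options object ahead of the output profile (the new signature appears in the final hunk of this section). A sketch of an updated call site, assuming opts is the plugin's conversion options:

    from calibre.ebooks.oeb.stylizer import Stylizer

    def make_stylizer(item, oeb_book, opts):
        # opts now precedes the profile argument.
        return Stylizer(item.data, item.href, oeb_book, opts,
                opts.output_profile)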
@@ -26,6 +26,11 @@ class LITInput(InputFormatPlugin):
         for item in oeb.spine:
             root = item.data
             if not hasattr(root, 'xpath'): continue
+            for bad in ('metadata', 'guide'):
+                metadata = XPath('//h:'+bad)(root)
+                if metadata:
+                    for x in metadata:
+                        x.getparent().remove(x)
             body = XPath('//h:body')(root)
             if body:
                 body = body[0]
@@ -32,7 +32,7 @@ class LITOutput(OutputFormatPlugin):
         mangler(oeb, opts)
         rasterizer = SVGRasterizer()
         rasterizer(oeb, opts)
-        lit = LitWriter()
+        lit = LitWriter(self.opts)
         lit(oeb, output_path)
@@ -134,7 +134,7 @@ def warn(x):
 class ReBinary(object):
     NSRMAP = {'': None, XML_NS: 'xml'}

-    def __init__(self, root, item, oeb, map=HTML_MAP):
+    def __init__(self, root, item, oeb, opts, map=HTML_MAP):
         self.item = item
         self.logger = oeb.logger
         self.manifest = oeb.manifest

@@ -143,7 +143,7 @@ class ReBinary(object):
         self.anchors = []
         self.page_breaks = []
         self.is_html = is_html = map is HTML_MAP
-        self.stylizer = Stylizer(root, item.href, oeb) if is_html else None
+        self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None
         self.tree_to_binary(root)
         self.content = self.buf.getvalue()
         self.ahc = self.build_ahc() if is_html else None

@@ -295,9 +295,8 @@ def preserve(function):
     return wrapper

 class LitWriter(object):
-    def __init__(self):
-        # Wow, no options
-        pass
+    def __init__(self, opts):
+        self.opts = opts

     def _litize_oeb(self):
         oeb = self._oeb

@@ -469,7 +468,7 @@ class LitWriter(object):
             secnum = 0
             if isinstance(data, etree._Element):
                 self._add_folder(name)
-                rebin = ReBinary(data, item, self._oeb, map=HTML_MAP)
+                rebin = ReBinary(data, item, self._oeb, self.opts, map=HTML_MAP)
                 self._add_file(name + '/ahc', rebin.ahc, 0)
                 self._add_file(name + '/aht', rebin.aht, 0)
                 item.page_breaks = rebin.page_breaks

@@ -562,7 +561,7 @@ class LitWriter(object):
         meta.attrib['ms--minimum_level'] = '0'
         meta.attrib['ms--attr5'] = '1'
         meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper()
-        rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP)
+        rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP)
         meta = rebin.content
         self._meta = meta
         self._add_file('/meta', meta)
@@ -63,6 +63,7 @@ def get_social_metadata(title, authors, publisher, isbn):
         mi.tags = []
         for x in tags:
             mi.tags.extend([y.strip() for y in x.split('/')])
+        mi.tags = [x.replace(',', ';') for x in mi.tags]
     comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
         AWS('Content')))
     if comments is not None:
@@ -143,7 +143,7 @@ class ResultList(list):
         except:
             report(verbose)
             tags = []
-        return tags
+        return [x.replace(',', ';') for x in tags]

     def get_publisher(self, entry, verbose):
         try:
@@ -134,7 +134,10 @@ def metadata_from_filename(name, pat=None):
         mi.authors = aus
     if prefs['swap_author_names'] and mi.authors:
         def swap(a):
-            parts = a.split()
+            if ',' in a:
+                parts = a.split(',', 1)
+            else:
+                parts = a.split(None, 1)
             if len(parts) > 1:
                 t = parts[-1]
                 parts = parts[:-1]
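The new split rule treats 'Last, First' and 'First Last' differently: a comma splits once at the comma, otherwise the split is at the first run of whitespace. A quick check of both cases:

    def split_name(a):
        if ',' in a:
            return a.split(',', 1)
        return a.split(None, 1)

    print split_name('Tolkien, J. R. R.')   # ['Tolkien', ' J. R. R.']
    print split_name('J. R. R. Tolkien')    # ['J.', 'R. R. Tolkien']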
@@ -17,6 +17,8 @@ from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN
 from calibre.ebooks.mobi.langcodes import iana2mobi

+import struct
+
 class StreamSlicer(object):

     def __init__(self, stream, start=0, stop=None):

@@ -72,25 +74,54 @@ class StreamSlicer(object):
                 return stream.write(value)
         raise TypeError("stream indices must be integers")

+    def update(self, data_blocks):
+        # Rewrite the stream
+        stream = self._stream
+        base = self.start
+        stream.seek(base)
+        self._stream.truncate(base)
+        for block in data_blocks:
+            stream.write(block)
+
+    def truncate(self, value):
+        self._stream.truncate(value)
+
 class MetadataUpdater(object):
     def __init__(self, stream):
         self.stream = stream
         data = self.data = StreamSlicer(stream)
-        type = self.type = data[60:68]
+        self.type = data[60:68]
+
+        if self.type != "BOOKMOBI":
+            return
+
         self.nrecs, = unpack('>H', data[76:78])
         record0 = self.record0 = self.record(0)
         self.encryption_type, = unpack('>H', record0[12:14])
         codepage, = unpack('>I', record0[28:32])
         self.codec = 'utf-8' if codepage == 65001 else 'cp1252'
-        image_base, = unpack('>I', record0[108:112])
-        flags, = unpack('>I', record0[128:132])
+        flags, = self.flags, = unpack('>I', record0[128:132])
         have_exth = self.have_exth = (flags & 0x40) != 0
         self.cover_record = self.thumbnail_record = None
         self.timestamp = None
+
+        self.pdbrecords = self.get_pdbrecords()
         if not have_exth:
-            return
+            self.create_exth()
+
+        # Fetch timestamp, cover_record, thumbnail_record
+        self.fetchEXTHFields()
+
+    def fetchEXTHFields(self):
+        stream = self.stream
+        record0 = self.record0
+
+        # 20:24 = mobiHeaderLength, 16=PDBHeader size
         exth_off = unpack('>I', record0[20:24])[0] + 16 + record0.start
+        image_base, = unpack('>I', record0[108:112])
+
+        # Fetch EXTH block
         exth = self.exth = StreamSlicer(stream, exth_off, record0.stop)
         nitems, = unpack('>I', exth[8:12])
         pos = 12

@@ -109,6 +140,130 @@ class MetadataUpdater(object):
             rindex, = self.thumbnail_rindex, = unpack('>I', content)
             self.thumbnail_record = self.record(rindex + image_base)

+    def patch(self, off, new_record0):
+        # Save the current size of each record
+        record_sizes = [len(new_record0)]
+        for i in range(1,self.nrecs-1):
+            record_sizes.append(self.pdbrecords[i+1][0]-self.pdbrecords[i][0])
+        # And the last one
+        record_sizes.append(self.data.stop - self.pdbrecords[self.nrecs-1][0])
+
+        # pdbrecord[0] is the offset of record0. It will not change
+        # record1 offset will be offset of record0 + len(new_record0)
+        updated_pdbrecords = [self.pdbrecords[0][0]]
+        record0_offset = self.pdbrecords[0][0]
+        updated_offset = record0_offset + len(new_record0)
+
+        for i in range(1,self.nrecs-1):
+            updated_pdbrecords.append(updated_offset)
+            updated_offset += record_sizes[i]
+        # Update the last pdbrecord
+        updated_pdbrecords.append(updated_offset)
+
+        # Read in current records 1 to last
+        data_blocks = [new_record0]
+        for i in range(1,self.nrecs):
+            data_blocks.append(self.data[self.pdbrecords[i][0]:self.pdbrecords[i][0] + record_sizes[i]])
+
+        # Rewrite the stream
+        self.record0.update(data_blocks)
+
+        # Rewrite the pdbrecords
+        self.update_pdbrecords(updated_pdbrecords)
+
+        # Truncate if necessary
+        if (updated_pdbrecords[-1] + record_sizes[-1]) < self.data.stop:
+            self.data.truncate(updated_pdbrecords[-1] + record_sizes[-1])
+        else:
+            self.data.stop = updated_pdbrecords[-1] + record_sizes[-1]
+
+    def patchSection(self, section, new):
+        off = self.pdbrecords[section][0]
+        self.patch(off, new)
+
+    def create_exth(self, exth=None):
+        # Add an EXTH block to record 0, rewrite the stream
+        # self.hexdump(self.record0)
+
+        # Fetch the title
+        title_offset, = struct.unpack('>L', self.record0[0x54:0x58])
+        title_length, = struct.unpack('>L', self.record0[0x58:0x5c])
+        title_in_file, = struct.unpack('%ds' % (title_length), self.record0[title_offset:title_offset + title_length])
+
+        # Adjust length to accommodate PrimaryINDX if necessary
+        mobi_header_length, = unpack('>L', self.record0[0x14:0x18])
+        if mobi_header_length == 0xe4:
+            # Patch mobi_header_length to 0xE8
+            self.record0[0x17] = "\xe8"
+            self.record0[0xf4:0xf8] = pack('>L', 0xFFFFFFFF)
+            mobi_header_length = 0xe8
+
+        # Set EXTH flag (0x40)
+        self.record0[0x80:0x84] = pack('>L', self.flags|0x40)
+
+        if not exth:
+            # Construct an empty EXTH block
+            pad = '\0' * 4
+            exth = ['EXTH', pack('>II', 12, 0), pad]
+            exth = ''.join(exth)
+
+        # Update title_offset
+        self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth))
+
+        # Create an updated Record0
+        new_record0 = StringIO()
+        new_record0.write(self.record0[:0x10 + mobi_header_length])
+        new_record0.write(exth)
+        new_record0.write(title_in_file)
+
+        # Pad to a 4-byte boundary
+        trail = len(new_record0.getvalue()) % 4
+        pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte
+        new_record0.write(pad)
+
+        #self.hexdump(new_record0.getvalue())
+
+        # Rebuild the stream, update the pdbrecords pointers
+        self.patchSection(0,new_record0.getvalue())
+
+        # Update record0
+        self.record0 = self.record(0)
+
+    def hexdump(self, src, length=16):
+        # Diagnostic
+        FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
+        N=0; result=''
+        while src:
+            s,src = src[:length],src[length:]
+            hexa = ' '.join(["%02X"%ord(x) for x in s])
+            s = s.translate(FILTER)
+            result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
+            N+=length
+        print result
+
+    def get_pdbrecords(self):
+        pdbrecords = []
+        for i in xrange(self.nrecs):
+            offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.data[78+i*8:78+i*8+8])
+            flags, val = a1, a2<<16|a3<<8|a4
+            pdbrecords.append( [offset, flags, val] )
+        return pdbrecords
+
+    def update_pdbrecords(self, updated_pdbrecords):
+        for (i, pdbrecord) in enumerate(updated_pdbrecords):
+            self.data[78+i*8:78+i*8 + 4] = pack('>L',pdbrecord)
+
+        # Refresh local copy
+        self.pdbrecords = self.get_pdbrecords()
+
+    def dump_pdbrecords(self):
+        # Diagnostic
+        print "MetadataUpdater.dump_pdbrecords()"
+        print "%10s %10s %10s" % ("offset","flags","val")
+        for i in xrange(len(self.pdbrecords)):
+            pdbrecord = self.pdbrecords[i]
+            print "%10X %10X %10X" % (pdbrecord[0], pdbrecord[1], pdbrecord[2])
+
     def record(self, n):
         if n >= self.nrecs:
             raise ValueError('non-existent record %r' % n)

@@ -120,6 +275,10 @@ class MetadataUpdater(object):
         return StreamSlicer(self.stream, start, stop)

     def update(self, mi):
+        if self.type != "BOOKMOBI":
+            raise MobiError("Setting metadata only supported for MOBI files of type 'BOOK'.\n"
+                            "\tThis is a '%s' file of type '%s'" % (self.type[0:4], self.type[4:8]))
+
         recs = []
         try:
             from calibre.ebooks.conversion.config import load_defaults

@@ -142,7 +301,6 @@ class MetadataUpdater(object):
         if mi.tags:
             subjects = '; '.join(mi.tags)
             recs.append((105, subjects.encode(self.codec, 'replace')))
-
         if mi.pubdate:
             recs.append((106, str(mi.pubdate).encode(self.codec, 'replace')))
         elif mi.timestamp:

@@ -151,15 +309,16 @@ class MetadataUpdater(object):
             recs.append((106, self.timestamp))
         else:
             recs.append((106, str(datetime.now()).encode(self.codec, 'replace')))

         if self.cover_record is not None:
             recs.append((201, pack('>I', self.cover_rindex)))
             recs.append((203, pack('>I', 0)))
         if self.thumbnail_record is not None:
             recs.append((202, pack('>I', self.thumbnail_rindex)))
-        exth = StringIO()
+
         if getattr(self, 'encryption_type', -1) != 0:
             raise MobiError('Setting metadata in DRMed MOBI files is not supported.')
+
+        exth = StringIO()
         for code, data in recs:
             exth.write(pack('>II', code, len(data) + 8))
             exth.write(data)

@@ -168,17 +327,16 @@ class MetadataUpdater(object):
         pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte
         exth = ['EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad]
         exth = ''.join(exth)
-        title = (mi.title or _('Unknown')).encode(self.codec, 'replace')
+
         if getattr(self, 'exth', None) is None:
             raise MobiError('No existing EXTH record. Cannot update metadata.')
-        title_off = (self.exth.start - self.record0.start) + len(exth)
-        title_len = len(title)
-        trail = len(self.exth) - len(exth) - len(title)
-        if trail < 0:
-            raise MobiError("Insufficient space to update metadata")
-        self.exth[:] = ''.join([exth, title, '\0' * trail])
-        self.record0[84:92] = pack('>II', title_off, title_len)
+
         self.record0[92:96] = iana2mobi(mi.language)
+        self.create_exth(exth)
+
+        # Fetch updated timestamp, cover_record, thumbnail_record
+        self.fetchEXTHFields()
+
         if mi.cover_data[1] or mi.cover:
             try:
                 data = mi.cover_data[1] if mi.cover_data[1] else open(mi.cover, 'rb').read()
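All of the patching above hangs off the PDB record index: 8 bytes per record starting at stream offset 78, of which the first 4 are a big-endian record offset (this is what get_pdbrecords parses). A standalone sketch of reading just the offsets, assuming data supports slicing like the StreamSlicer above:

    import struct

    def read_pdb_offsets(data, nrecs):
        # Each index entry: >L offset, B attributes, 3 bytes of unique id.
        offsets = []
        for i in xrange(nrecs):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
                    data[78 + i*8 : 78 + i*8 + 8])
            offsets.append(offset)
        return offsets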
@@ -92,6 +92,7 @@ class MobiMLizer(object):
     def __call__(self, oeb, context):
         oeb.logger.info('Converting XHTML to Mobipocket markup...')
         self.oeb = oeb
+        self.opts = context
         self.profile = profile = context.dest
         self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
         self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())

@@ -114,7 +115,7 @@ class MobiMLizer(object):
     def mobimlize_spine(self):
         'Iterate over the spine and convert it to MOBIML'
         for item in self.oeb.spine:
-            stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
+            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
             body = item.data.find(XHTML('body'))
             nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
             nbody = etree.SubElement(nroot, XHTML('body'))

@@ -163,6 +164,8 @@ class MobiMLizer(object):
         parent = bstate.nested[-1] if bstate.nested else bstate.body
         indent = istate.indent
         left = istate.left
+        if isinstance(indent, basestring):
+            indent = 0
         if indent < 0 and abs(indent) < left:
             left += indent
             indent = 0
@@ -50,7 +50,7 @@ class MOBIOutput(OutputFormatPlugin):
     def check_for_masthead(self):
         found = 'masthead' in self.oeb.guide
         if not found:
-            self.oeb.log.debug('No masthead found, generating default one...')
+            self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
             try:
                 from PIL import Image as PILImage
                 PILImage

@@ -65,6 +65,9 @@ class MOBIOutput(OutputFormatPlugin):
             id, href = self.oeb.manifest.generate('masthead', 'masthead')
             self.oeb.manifest.add(id, href, 'image/gif', data=raw)
             self.oeb.guide.add('masthead', 'Masthead Image', href)
+        else:
+            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
+

     def dump_toc(self, toc) :
         self.log( "\n >>> TOC contents <<<")
@@ -573,6 +573,8 @@ class MobiReader(object):
                         attrib[attr] = "%dpx"%int(nval)
                     except:
                         del attrib[attr]
+                elif val.lower().endswith('%'):
+                    del attrib[attr]
             elif tag.tag == 'pre':
                 if not tag.text:
                     tag.tag = 'div'
@@ -619,7 +619,6 @@ class MobiWriter(object):
                     self._oeb.log.warning('_generate_indexed_navpoints: Failed to generate index')
                     # Zero out self._HTMLRecords, return False
                     self._HTMLRecords = []
-                    #last_name = None
                     return False

                 previousOffset = offset
@@ -760,6 +760,8 @@ class Manifest(object):

         def _parse_xml(self, data):
             data = xml_to_unicode(data, strip_encoding_pats=True)[0]
+            if not data:
+                return None
             parser = etree.XMLParser(recover=True)
             try:
                 return etree.fromstring(data, parser=parser)

@@ -907,9 +909,15 @@ class Manifest(object):
                     'content': '%s; charset=utf-8' % XHTML_NS})
             # Ensure has a <body/>
             if not xpath(data, '/h:html/h:body'):
-                self.oeb.logger.warn(
-                    'File %r missing <body/> element' % self.href)
-                etree.SubElement(data, XHTML('body'))
+                body = xpath(data, '//h:body')
+                if body:
+                    body = body[0]
+                    body.getparent().remove(body)
+                    data.append(body)
+                else:
+                    self.oeb.logger.warn(
+                        'File %r missing <body/> element' % self.href)
+                    etree.SubElement(data, XHTML('body'))

             # Remove microsoft office markup
             r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
@@ -1,99 +0,0 @@
-'''
-Registry associating file extensions with Reader classes.
-'''
-from __future__ import with_statement
-
-__license__   = 'GPL v3'
-__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
-
-import sys, os, logging
-from itertools import chain
-import calibre
-from calibre.ebooks.oeb.base import OEBError
-from calibre.ebooks.oeb.reader import OEBReader
-from calibre.ebooks.oeb.writer import OEBWriter
-from calibre.ebooks.lit.reader import LitReader
-from calibre.ebooks.lit.writer import LitWriter
-from calibre.ebooks.mobi.reader import MobiReader
-from calibre.ebooks.mobi.writer import MobiWriter
-from calibre.ebooks.oeb.base import OEBBook
-from calibre.ebooks.oeb.profile import Context
-from calibre.utils.config import Config
-
-__all__ = ['get_reader']
-
-REGISTRY = {
-    '.opf': (OEBReader, None),
-    '.lit': (LitReader, LitWriter),
-    '.mobi': (MobiReader, MobiWriter),
-    }
-
-def ReaderFactory(path):
-    if os.path.isdir(path):
-        return OEBReader
-    ext = os.path.splitext(path)[1].lower()
-    Reader = REGISTRY.get(ext, (None, None))[0]
-    if Reader is None:
-        raise OEBError('Unknown e-book file extension %r' % ext)
-    return Reader
-
-def WriterFactory(path):
-    if os.path.isdir(path):
-        return OEBWriter
-    ext = os.path.splitext(path)[1].lower()
-    if not os.path.exists(path) and not ext:
-        return OEBWriter
-    Writer = REGISTRY.get(ext, (None, None))[1]
-    if Writer is None:
-        raise OEBError('Unknown e-book file extension %r' % ext)
-    return Writer
-
-
-def option_parser(Reader, Writer):
-    cfg = Config('ebook-convert', _('Options to control e-book conversion.'))
-    Reader.config(cfg)
-    for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
-        Transform.config(cfg)
-    Writer.config(cfg)
-    parser = cfg.option_parser()
-    parser.add_option('--encoding', default=None,
-        help=_('Character encoding for input. Default is to auto detect.'))
-    parser.add_option('-o', '--output', default=None,
-        help=_('Output file. Default is derived from input filename.'))
-    parser.add_option('-p', '--pretty-print', action='store_true',
-        default=False, help=_('Produce more human-readable XML output.'))
-    parser.add_option('-v', '--verbose', default=0, action='count',
-        help=_('Useful for debugging.'))
-    return parser
-
-
-def main(argv=sys.argv):
-    if len(argv) < 3:
-        print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]")
-        return 1
-    inpath, outpath = argv[1], argv[2]
-    Reader = ReaderFactory(inpath)
-    Writer = WriterFactory(outpath)
-    parser = option_parser(Reader, Writer)
-    opts, args = parser.parse_args(argv[3:])
-    if len(args) != 0:
-        parser.print_help()
-        return 1
-    logger = logging.getLogger('ebook-convert')
-    calibre.setup_cli_handlers(logger, logging.DEBUG)
-    encoding = opts.encoding
-    pretty_print = opts.pretty_print
-    oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)
-    context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE)
-    reader = Reader.generate(opts)
-    writer = Writer.generate(opts)
-    transforms = []
-    for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
-        transforms.append(Transform.generate(opts))
-    reader(oeb, inpath)
-    for transform in transforms:
-        transform(oeb, context)
-    writer(oeb, outpath)
-    return 0
-
-if __name__ == '__main__':
-    sys.exit(main())
@ -110,16 +110,20 @@ class CSSSelector(etree.XPath):
 class Stylizer(object):
     STYLESHEETS = WeakKeyDictionary()

-    def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'],
+    def __init__(self, tree, path, oeb, opts, profile=PROFILES['PRS505'],
                  extra_css='', user_css=''):
-        self.oeb = oeb
+        self.oeb, self.opts = oeb, opts
         self.profile = profile
         self.logger = oeb.logger
         item = oeb.manifest.hrefs[path]
         basename = os.path.basename(path)
         cssname = os.path.splitext(basename)[0] + '.css'
         stylesheets = [HTML_CSS_STYLESHEET]
-        head = xpath(tree, '/h:html/h:head')[0]
+        head = xpath(tree, '/h:html/h:head')
+        if head:
+            head = head[0]
+        else:
+            head = []
         parser = cssutils.CSSParser(fetcher=self._fetch_css_file,
                 log=logging.getLogger('calibre.css'))
         self.font_face_rules = []
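Stylizer now takes the conversion options as a new fourth argument (and also
tolerates documents with no <head>). Every construction site below follows
suit: CSSFlattener passes self.context, while CaseMangler and SVGRasterizer
stash their context as self.opts and pass that along, which is what gives
the new _normalize_text_align access to options such as dont_justify.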
@ -249,6 +253,8 @@ class Stylizer(object):
                 style.update(self._normalize_font(prop.cssValue))
             elif name == 'list-style':
                 style.update(self._normalize_list_style(prop.cssValue))
+            elif name == 'text-align':
+                style.update(self._normalize_text_align(prop.cssValue))
             else:
                 style[name] = prop.value
         if 'font-size' in style:
@ -306,6 +312,19 @@ class Stylizer(object):

         return style

+    def _normalize_text_align(self, cssvalue):
+        style = {}
+        text = cssvalue.cssText
+        if text == 'inherit':
+            style['text-align'] = 'inherit'
+        else:
+            if text in ('left', 'justify'):
+                val = 'left' if self.opts.dont_justify else 'justify'
+                style['text-align'] = val
+            else:
+                style['text-align'] = text
+        return style
+
     def _normalize_font(self, cssvalue):
         composition = ('font-style', 'font-variant', 'font-weight',
                        'font-size', 'line-height', 'font-family')
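In effect, _normalize_text_align routes only 'left' and 'justify' through the
dont_justify switch; 'center', 'right' and 'inherit' pass through untouched.
A quick self-contained check (FakeCSSValue and FakeOpts are hypothetical
stand-ins for cssutils' cssValue and the conversion options; only cssText
and dont_justify are consulted):

    class FakeCSSValue(object):
        # stand-in for cssutils' cssValue; only cssText is read
        def __init__(self, text):
            self.cssText = text

    class FakeOpts(object):
        dont_justify = True

    class Demo(object):
        opts = FakeOpts()
        # _normalize_text_align exactly as added above
        def _normalize_text_align(self, cssvalue):
            style = {}
            text = cssvalue.cssText
            if text == 'inherit':
                style['text-align'] = 'inherit'
            else:
                if text in ('left', 'justify'):
                    val = 'left' if self.opts.dont_justify else 'justify'
                    style['text-align'] = val
                else:
                    style['text-align'] = text
            return style

    d = Demo()
    assert d._normalize_text_align(FakeCSSValue('justify')) == {'text-align': 'left'}
    assert d._normalize_text_align(FakeCSSValue('center')) == {'text-align': 'center'}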
@ -141,7 +141,7 @@ class CSSFlattener(object):
             bs.append('text-align: '+ \
                 ('left' if self.context.dont_justify else 'justify'))
         body.set('style', '; '.join(bs))
-        stylizer = Stylizer(html, item.href, self.oeb, profile,
+        stylizer = Stylizer(html, item.href, self.oeb, self.context, profile,
                 user_css=self.context.extra_css,
                 extra_css=css)
         self.stylizers[item] = stylizer
@ -33,6 +33,7 @@ class CaseMangler(object):
     def __call__(self, oeb, context):
         oeb.logger.info('Applying case-transforming CSS...')
         self.oeb = oeb
+        self.opts = context
         self.profile = context.source
         self.mangle_spine()

@ -44,7 +45,7 @@ class CaseMangler(object):
         relhref = item.relhref(href)
         etree.SubElement(html.find(XHTML('head')), XHTML('link'),
                 rel='stylesheet', href=relhref, type=CSS_MIME)
-        stylizer = Stylizer(html, item.href, self.oeb, self.profile)
+        stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
         self.mangle_elem(html.find(XHTML('body')), stylizer)

     def text_transform(self, transform, text):
@ -44,6 +44,7 @@ class SVGRasterizer(object):
     def __call__(self, oeb, context):
         oeb.logger.info('Rasterizing SVG images...')
         self.oeb = oeb
+        self.opts = context
         self.profile = context.dest
         self.images = {}
         self.dataize_manifest()

@ -102,7 +103,7 @@ class SVGRasterizer(object):
     def rasterize_spine(self):
         for item in self.oeb.spine:
             html = item.data
-            stylizer = Stylizer(html, item.href, self.oeb, self.profile)
+            stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
             self.rasterize_item(item, stylizer)

     def rasterize_item(self, item, stylizer):
@ -20,7 +20,7 @@ class Reader(FormatReader):

         if record0_size == 132:
             self.reader = Reader132(header, stream, log, options)
-        elif record0_size == 202:
+        elif record0_size in (116, 202):
             self.reader = Reader202(header, stream, log, options)
         else:
             raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size)

@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 '''
-Read content from ereader pdb file with a 202 byte header created by Makebook.
+Read content from ereader pdb file with a 116 and 202 byte header created by Makebook.
 '''
 __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'

@ -44,7 +44,7 @@ class Reader202(FormatReader):

         self.header_record = HeaderRecord(self.section_data(0))

-        if self.header_record.version != 4:
+        if self.header_record.version not in (2, 4):
             raise EreaderError('Unknown book version %i.' % self.header_record.version)

         from calibre.ebooks.metadata.pdb import get_metadata
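The effect of these three hunks is that 116-byte header records (book
version 2, as produced by Makebook) are now routed through the same
Reader202 code path as the 202-byte ones. The dispatch itself is a plain
size-keyed branch; a reduced sketch with hypothetical names, not calibre's
classes:

    # Reduced sketch of the size-keyed dispatch after this change.
    def pick_reader(record0_size):
        if record0_size == 132:
            return 'Reader132'
        elif record0_size in (116, 202):
            return 'Reader202'
        raise ValueError('header record size %s is not supported' % record0_size)

    assert pick_reader(116) == pick_reader(202) == 'Reader202'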
@ -42,7 +42,9 @@ class Writer(FormatWriter):
         pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')

         text, text_sizes = self._text(pml)
-        chapter_index = self._index_item(r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', pml)
+        chapter_index = self._index_item(r'(?s)\\C(?P<val>[0-4])="(?P<text>.+?)"', pml)
+        chapter_index += self._index_item(r'(?s)\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', pml)
+        chapter_index += self._index_item(r'(?s)\\x(?P<text>.+?)\\x', pml)
         link_index = self._index_item(r'(?s)\\Q="(?P<text>.+?)"', pml)
         images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
         metadata = [self._metadata(metadata)]
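With the two extra calls, \X0-\X4 section markers and inline \x chapter
markers now feed the chapter index alongside \Cn="..." entries. What those
three patterns actually match, demonstrated with plain re on a made-up
snippet (Python 2, like the code above):

    import re

    pml = r'\x Intro \x body \X1 Part One \X1 more \C2="Notes"'
    print re.findall(r'(?s)\\C(?P<val>[0-4])="(?P<text>.+?)"', pml)
    # [('2', 'Notes')]
    print re.findall(r'(?s)\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', pml)
    # [('1', ' Part One ')]
    print re.findall(r'(?s)\\x(?P<text>.+?)\\x', pml)
    # [' Intro ']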
@ -18,38 +18,11 @@ class Font(object):
         self.color = spec.get('color')
         self.family = spec.get('family')

-class Column(object):
-
-    # A column contains an element is the element bulges out to
-    # the left or the right by at most HFUZZ*col width.
-    HFUZZ = 0.2
+class Element(object):

     def __init__(self):
-        self.left = self.right = self.top = self.bottom = 0
-        self.width = self.height = 0
-        self.elements = []
-
-    def add(self, elem):
-        if elem in self.elements: return
-        self.elements.append(elem)
-        self.elements.sort(cmp=lambda x,y:cmp(x.bottom,y.bottom))
-        self.top = self.elements[0].top
-        self.bottom = self.elements[-1].bottom
-        self.left, self.right = sys.maxint, 0
-        for x in self:
-            self.left = min(self.left, x.left)
-            self.right = max(self.right, x.right)
-        self.width, self.height = self.right-self.left, self.bottom-self.top
-
-    def __iter__(self):
-        for x in self.elements:
-            yield x
-
-    def contains(self, elem):
-        return elem.left > self.left - self.HFUZZ*self.width and \
-               elem.right < self.right + self.HFUZZ*self.width
-
-class Element(object):
+        self.starts_block = None
+        self.block_style = None

     def __eq__(self, other):
         return self.id == other.id

@ -60,17 +33,25 @@ class Element(object):
 class Image(Element):

     def __init__(self, img, opts, log, idc):
+        Element.__init__(self)
         self.opts, self.log = opts, log
         self.id = idc.next()
         self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \
             map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth',
                 'iheight')))
         self.src = img.get('src')
+        self.bottom = self.top + self.height
+        self.right = self.left + self.width
+
+    def to_html(self):
+        return '<img src="%s" width="%dpx" height="%dpx"/>' % \
+                (self.src, int(self.width), int(self.height))
+

 class Text(Element):

     def __init__(self, text, font_map, opts, log, idc):
+        Element.__init__(self)
         self.id = idc.next()
         self.opts, self.log = opts, log
         self.font_map = font_map
@ -89,8 +70,6 @@ class Text(Element):
         self.raw = text.text if text.text else u''
         for x in text.iterchildren():
             self.raw += etree.tostring(x, method='xml', encoding=unicode)
-            if x.tail:
-                self.raw += x.tail
         self.average_character_width = self.width/len(self.text_as_string)

     def coalesce(self, other, page_number):

@ -109,6 +88,9 @@ class Text(Element):
         self.average_character_width = (self.average_character_width +
                 other.average_character_width)/2.0

+    def to_html(self):
+        return self.raw
+

 class FontSizeStats(dict):

     def __init__(self, stats):
@ -131,6 +113,11 @@ class Interval(object):
         right = min(self.right, other.right)
         return Interval(left, right)

+    def centered_in(self, parent):
+        left = abs(self.left - parent.left)
+        right = abs(self.right - parent.right)
+        return abs(left-right) < 3
+
     def __nonzero__(self):
         return self.width > 0
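centered_in calls an interval centered within a parent when its left and
right margins differ by less than three points; the reflow code below uses
it to keep caption lines attached to the image they sit under. For instance:

    # Minimal stand-in for Interval, just enough to show the 3-point
    # symmetry test (the real class also has width and intersection).
    class I(object):
        def __init__(self, left, right):
            self.left, self.right = left, right
        def centered_in(self, parent):
            left = abs(self.left - parent.left)
            right = abs(self.right - parent.right)
            return abs(left-right) < 3

    img = I(100, 300)
    assert I(140, 258).centered_in(img)      # margins 40 and 42: centered
    assert not I(100, 250).centered_in(img)  # margins 0 and 50: not centered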
@ -140,6 +127,213 @@ class Interval(object):
     def __hash__(self):
         return hash('(%f,%f)'%(self.left, self.right))

+
+class Column(object):
+
+    # A column contains an element is the element bulges out to
+    # the left or the right by at most HFUZZ*col width.
+    HFUZZ = 0.2
+
+    def __init__(self):
+        self.left = self.right = self.top = self.bottom = 0
+        self.width = self.height = 0
+        self.elements = []
+        self.average_line_separation = 0
+
+    def add(self, elem):
+        if elem in self.elements: return
+        self.elements.append(elem)
+        self.elements.sort(cmp=lambda x,y:cmp(x.bottom,y.bottom))
+        self.top = self.elements[0].top
+        self.bottom = self.elements[-1].bottom
+        self.left, self.right = sys.maxint, 0
+        for x in self:
+            self.left = min(self.left, x.left)
+            self.right = max(self.right, x.right)
+        self.width, self.height = self.right-self.left, self.bottom-self.top
+
+    def __iter__(self):
+        for x in self.elements:
+            yield x
+
+    def __len__(self):
+        return len(self.elements)
+
+    def contains(self, elem):
+        return elem.left > self.left - self.HFUZZ*self.width and \
+               elem.right < self.right + self.HFUZZ*self.width
+
+    def collect_stats(self):
+        if len(self.elements) > 1:
+            gaps = [self.elements[i+1].top - self.elements[i].bottom for i in
+                    range(0, len(self.elements)-1)]
+            self.average_line_separation = sum(gaps)/len(gaps)
+        for i, elem in enumerate(self.elements):
+            left_margin = elem.left - self.left
+            elem.indent_fraction = left_margin/self.width
+            elem.width_fraction = elem.width/self.width
+            if i == 0:
+                elem.top_gap_ratio = None
+            else:
+                elem.top_gap_ratio = (self.elements[i-1].bottom -
+                        elem.top)/self.average_line_separation
+
+    def previous_element(self, idx):
+        if idx == 0:
+            return None
+        return self.elements[idx-1]
+
+
+class Box(list):
+
+    def __init__(self, type='p'):
+        self.tag = type
+
+    def to_html(self):
+        ans = ['<%s>'%self.tag]
+        for elem in self:
+            if isinstance(elem, int):
+                ans.append('<a name="page_%d"/>'%elem)
+            else:
+                ans.append(elem.to_html()+' ')
+        ans.append('</%s>'%self.tag)
+        return ans
+
+class ImageBox(Box):
+
+    def __init__(self, img):
+        Box.__init__(self)
+        self.img = img
+
+    def to_html(self):
+        ans = ['<div style="text-align:center">']
+        ans.append(self.img.to_html())
+        if len(self) > 0:
+            ans.append('<br/>')
+            for elem in self:
+                if isinstance(elem, int):
+                    ans.append('<a name="page_%d"/>'%elem)
+                else:
+                    ans.append(elem.to_html()+' ')
+        ans.append('</div>')
+        return ans
+
+
+class Region(object):
+
+    def __init__(self, opts, log):
+        self.opts, self.log = opts, log
+        self.columns = []
+        self.top = self.bottom = self.left = self.right = self.width = self.height = 0
+
+    def add(self, columns):
+        if not self.columns:
+            for x in sorted(columns, cmp=lambda x,y: cmp(x.left, y.left)):
+                self.columns.append(x)
+        else:
+            for i in range(len(columns)):
+                for elem in columns[i]:
+                    self.columns[i].add(elem)
+
+    def contains(self, columns):
+        # TODO: handle unbalanced columns
+        if not self.columns:
+            return True
+        if len(columns) != len(self.columns):
+            return False
+        for i in range(len(columns)):
+            c1, c2 = self.columns[i], columns[i]
+            x1 = Interval(c1.left, c1.right)
+            x2 = Interval(c2.left, c2.right)
+            intersection = x1.intersection(x2)
+            base = min(x1.width, x2.width)
+            if intersection.width/base < 0.6:
+                return False
+        return True
+
+    @property
+    def is_empty(self):
+        return len(self.columns) == 0
+
+    @property
+    def line_count(self):
+        max_lines = 0
+        for c in self.columns:
+            max_lines = max(max_lines, len(c))
+        return max_lines
+
+    @property
+    def is_small(self):
+        return self.line_count < 3
+
+    def absorb(self, singleton):
+
+        def most_suitable_column(elem):
+            mc, mw = None, 0
+            for c in self.columns:
+                i = Interval(c.left, c.right)
+                e = Interval(elem.left, elem.right)
+                w = i.intersection(e).width
+                if w > mw:
+                    mc, mw = c, w
+            if mc is None:
+                self.log.warn('No suitable column for singleton',
+                        elem.to_html())
+                mc = self.columns[0]
+            return mc
+
+        print
+        for c in singleton.columns:
+            for elem in c:
+                col = most_suitable_column(elem)
+                if self.opts.verbose > 3:
+                    idx = self.columns.index(col)
+                    self.log.debug(u'Absorbing singleton %s into column'%elem.to_html(),
+                            idx)
+                col.add(elem)
+
+    def collect_stats(self):
+        for column in self.columns:
+            column.collect_stats()
+        self.average_line_separation = sum([x.average_line_separation for x in
+            self.columns])/float(len(self.columns))
+
+    def __iter__(self):
+        for x in self.columns:
+            yield x
+
+    def linearize(self):
+        self.elements = []
+        for x in self.columns:
+            self.elements.extend(x)
+        self.boxes = [Box()]
+        for i, elem in enumerate(self.elements):
+            if isinstance(elem, Image):
+                self.boxes.append(ImageBox(elem))
+                img = Interval(elem.left, elem.right)
+                for j in range(i+1, len(self.elements)):
+                    t = self.elements[j]
+                    if not isinstance(t, Text):
+                        break
+                    ti = Interval(t.left, t.right)
+                    if not ti.centered_in(img):
+                        break
+                    self.boxes[-1].append(t)
+                self.boxes.append(Box())
+            else:
+                is_indented = False
+                if i+1 < len(self.elements):
+                    indent_diff = elem.indent_fraction - \
+                        self.elements[i+1].indent_fraction
+                    if indent_diff > 0.05:
+                        is_indented = True
+                if elem.top_gap_ratio > 1.2 or is_indented:
+                    self.boxes.append(Box())
+                self.boxes[-1].append(elem)
+
+
 class Page(object):

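Two details in the additions above are worth calling out. Region.contains
decides whether a freshly detected row of columns continues the current
region by requiring at least 60% horizontal overlap between corresponding
columns, measured against the narrower of the two; anything less starts a
new region. Stripped to bare intervals:

    # The 60% rule from Region.contains, reduced to (left, right) tuples.
    def overlaps_enough(c1, c2):
        intersection = max(0, min(c1[1], c2[1]) - max(c1[0], c2[0]))
        base = min(c1[1] - c1[0], c2[1] - c2[0])
        return intersection / float(base) >= 0.6

    assert overlaps_enough((0, 100), (10, 110))      # 90% overlap: same column
    assert not overlaps_enough((0, 100), (80, 200))  # 20% overlap: new region

Note also that coalesce_regions (added further down) is visibly unfinished
in this revision: it computes prev and next candidates for each run of small
regions but never calls absorb, and nothing is ever added to its absorbed set.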
@ -151,6 +345,8 @@ class Page(object):
     # for them to be considered to be part of the same text fragment
     LINE_FACTOR = 0.4

+    # Multiplies the average line height when determining row height
+    # of a particular element to detect columns.
     YFUZZ = 1.5

@ -215,21 +411,57 @@ class Page(object):
             self.texts.remove(match)

     def first_pass(self):
+        'Sort page into regions and columns'
         self.regions = []
         if not self.elements:
             return
         for i, x in enumerate(self.elements):
             x.idx = i
-        self.current_region = None
+        current_region = Region(self.opts, self.log)
         processed = set([])
         for x in self.elements:
             if x in processed: continue
             elems = set(self.find_elements_in_row_of(x))
             columns = self.sort_into_columns(x, elems)
             processed.update(elems)
-            columns
+            if not current_region.contains(columns):
+                self.regions.append(current_region)
+                current_region = Region(self.opts, self.log)
+            current_region.add(columns)
+        if not current_region.is_empty:
+            self.regions.append(current_region)
+
+        self.coalesce_regions()
+
+    def coalesce_regions(self):
+        # find contiguous sets of small regions
+        # absorb into a neighboring region (prefer the one with number of cols
+        # closer to the avg number of cols in the set, if equal use larger
+        # region)
+        # merge contiguous regions that can contain each other
+        absorbed = set([])
+        found = True
+        while found:
+            found = False
+            for i, region in enumerate(self.regions):
+                if region.is_small:
+                    found = True
+                    regions = []
+                    for j in range(i+1, len(self.regions)):
+                        if self.regions[j].is_small:
+                            regions.append(self.regions[j])
+                        else:
+                            break
+                    prev = None if i == 0 else i-1
+                    next = j if self.regions[j] not in regions else None
+
+
     def sort_into_columns(self, elem, neighbors):
+        neighbors.add(elem)
+        neighbors = sorted(neighbors, cmp=lambda x,y:cmp(x.left, y.left))
+        if self.opts.verbose > 3:
+            self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
         columns = [Column()]
         columns[0].add(elem)
         for x in neighbors:
@ -247,7 +479,7 @@ class Page(object):

     def find_elements_in_row_of(self, x):
         interval = Interval(x.top,
-                x.top + self.YFUZZ*(1+self.average_text_height))
+                x.top + self.YFUZZ*(self.average_text_height))
         h_interval = Interval(x.left, x.right)
         for y in self.elements[x.idx:x.idx+15]:
             if y is not x:
@ -258,6 +490,12 @@ class Page(object):
                     x_interval.intersection(h_interval).width <= 0:
                 yield y

+    def second_pass(self):
+        'Locate paragraph boundaries in each column'
+        for region in self.regions:
+            region.collect_stats()
+            region.linearize()
+

 class PDFDocument(object):

@ -287,6 +525,10 @@ class PDFDocument(object):
         for page in self.pages:
             page.document_font_stats = self.font_size_stats
             page.first_pass()
+            page.second_pass()
+
+        self.linearize()
+        self.render()

     def collect_font_statistics(self):
         self.font_size_stats = {}
@ -299,5 +541,43 @@ class PDFDocument(object):

         self.font_size_stats = FontSizeStats(self.font_size_stats)

+    def linearize(self):
+        self.elements = []
+        last_region = last_block = None
+        for page in self.pages:
+            page_number_inserted = False
+            for region in page.regions:
+                merge_first_block = last_region is not None and \
+                    len(last_region.columns) == len(region.columns) and \
+                    not hasattr(last_block, 'img')
+                for i, block in enumerate(region.boxes):
+                    if merge_first_block:
+                        merge_first_block = False
+                        if not page_number_inserted:
+                            last_block.append(page.number)
+                            page_number_inserted = True
+                        for elem in block:
+                            last_block.append(elem)
+                    else:
+                        if not page_number_inserted:
+                            block.insert(0, page.number)
+                            page_number_inserted = True
+                        self.elements.append(block)
+                    last_block = block
+                last_region = region
+
+    def render(self):
+        html = ['<?xml version="1.0" encoding="UTF-8"?>',
+                '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>',
+                '<title>PDF Reflow conversion</title>', '</head>', '<body>',
+                '<div>']
+        for elem in self.elements:
+            html.extend(elem.to_html())
+        html += ['</body>', '</html>']
+        with open('index.html', 'wb') as f:
+            f.write((u'\n'.join(html)).encode('utf-8'))
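render simply concatenates the fragments each block's to_html returns, with
page numbers stored as bare ints that become named anchors. A tiny
self-contained check of that contract, re-using the Box class added above
(Snippet is a hypothetical stand-in for a Text element):

    class Box(list):
        # same Box as added in the reflow hunk above
        def __init__(self, type='p'):
            self.tag = type
        def to_html(self):
            ans = ['<%s>' % self.tag]
            for elem in self:
                if isinstance(elem, int):
                    ans.append('<a name="page_%d"/>' % elem)
                else:
                    ans.append(elem.to_html() + ' ')
            ans.append('</%s>' % self.tag)
            return ans

    class Snippet(object):
        # stand-in for a Text element; only to_html is needed
        def __init__(self, raw):
            self.raw = raw
        def to_html(self):
            return self.raw

    box = Box()
    box.append(12)               # page numbers are plain ints
    box.append(Snippet('Hello'))
    assert ''.join(box.to_html()) == '<p><a name="page_12"/>Hello </p>'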
@ -8,6 +8,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

+import os
 import re
 import StringIO
@ -170,6 +171,9 @@ class PML_HTMLizer(object):
         # &amp;. It will display as &
         pml = pml.replace('&', '&amp;')

+        pml = re.sub(r'(?<=\\x)(?P<text>.*?)(?=\\x)', lambda match: '="%s"%s' % (self.strip_pml(match.group('text')), match.group('text')), pml)
+        pml = re.sub(r'(?<=\\X[0-4])(?P<text>.*?)(?=\\X[0-4])', lambda match: '="%s"%s' % (self.strip_pml(match.group('text')), match.group('text')), pml)
+
         pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml)
         pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml)
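The two new substitutions rewrite \x and \Xn codes so that a stripped copy
of the heading text is carried inline as ="...", mirroring the \Cn="..."
form; that is what later lets parse_pml handle x, X and C uniformly when
building the TOC. Illustrated on a plain-text heading, where strip_pml is
effectively the identity:

    import re

    def strip_pml(text):
        # stand-in for PML_HTMLizer.strip_pml; a no-op on plain text
        return text

    pml = '\\xChapter One\\x'
    pml = re.sub(r'(?<=\\x)(?P<text>.*?)(?=\\x)',
            lambda match: '="%s"%s' % (strip_pml(match.group('text')),
                match.group('text')), pml)
    print pml
    # \x="Chapter One"Chapter One\x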
@ -177,6 +181,23 @@ class PML_HTMLizer(object):

         return pml

+    def strip_pml(self, pml):
+        pml = re.sub(r'\\C\d=".+*"', '', pml)
+        pml = re.sub(r'\\Fn=".+*"', '', pml)
+        pml = re.sub(r'\\Sd=".+*"', '', pml)
+        pml = re.sub(r'\\.=".+*"', '', pml)
+        pml = re.sub(r'\\X\d', '', pml)
+        pml = re.sub(r'\\S[pbd]', '', pml)
+        pml = re.sub(r'\\Fn', '', pml)
+        pml = re.sub(r'\\a\d\d\d', '', pml)
+        pml = re.sub(r'\\U\d\d\d\d', '', pml)
+        pml = re.sub(r'\\.', '', pml)
+        pml.replace('\r\n', ' ')
+        pml.replace('\n', ' ')
+        pml.replace('\r', ' ')
+
+        return pml
+
     def cleanup_html(self, html):
         old = html
         html = self.cleanup_html_remove_redundant(html)
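One caveat in strip_pml as committed: str.replace returns a new string
rather than mutating in place, so the three trailing pml.replace(...) calls
discard their results and the newline-to-space normalization never takes
effect. For the replaces to stick they would have to be assignments:

    # What the trailing calls presumably intend:
    pml = pml.replace('\r\n', ' ')
    pml = pml.replace('\n', ' ')
    pml = pml.replace('\r', ' ')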
@ -198,14 +219,26 @@ class PML_HTMLizer(object):
     def start_line(self):
         start = u''

+        div = []
+        span = []
+        other = []
+
         for key, val in self.state.items():
             if val[0]:
-                if key in self.STATES_VALUE_REQ:
-                    start += self.STATES_TAGS[key][0] % val[1]
-                elif key in self.STATES_VALUE_REQ_2:
-                    start += self.STATES_TAGS[key][0] % (val[1], val[1])
+                if key in self.DIV_STATES:
+                    div.append((key, val[1]))
+                elif key in self.SPAN_STATES:
+                    span.append((key, val[1]))
                 else:
-                    start += self.STATES_TAGS[key][0]
+                    other.append((key, val[1]))
+
+        for key, val in other+div+span:
+            if key in self.STATES_VALUE_REQ:
+                start += self.STATES_TAGS[key][0] % val
+            elif key in self.STATES_VALUE_REQ_2:
+                start += self.STATES_TAGS[key][0] % (val, val)
+            else:
+                start += self.STATES_TAGS[key][0]

         return u'<p>%s' % start

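The point of splitting the open states into other, div and span groups is
ordering: when a new paragraph reopens the active formatting, block-level
wrappers must be emitted before span-level ones so the markup nests
properly. A condensed illustration with a hypothetical three-state table
(not calibre's STATES_TAGS):

    DIV_STATES, SPAN_STATES = ('ra',), ('c',)
    TAGS = {'b': '<b>', 'ra': '<div class="%s">', 'c': '<span class="%s">'}

    state = {'c': (True, 'red'), 'b': (True, None), 'ra': (True, 'right')}
    div, span, other = [], [], []
    for key, (on, val) in state.items():
        if on:
            (div if key in DIV_STATES else
             span if key in SPAN_STATES else other).append((key, val))

    start = u''
    for key, val in other + div + span:
        start += TAGS[key] % val if val is not None else TAGS[key]
    print u'<p>%s' % start
    # <p><b><div class="right"><span class="red">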
@ -490,9 +523,9 @@ class PML_HTMLizer(object):
             if c == '\\':
                 c = line.read(1)

-                if c in 'xqcrtTiIuobBlk':
+                if c in 'qcrtTiIuobBlk':
                     text = self.process_code(c, line)
-                elif c in 'FSX':
+                elif c in 'FS':
                     l = line.read(1)
                     if '%s%s' % (c, l) == 'Fn':
                         text = self.process_code('Fn', line, 'fn')
@ -502,8 +535,24 @@ class PML_HTMLizer(object):
                         text = self.process_code('SB', line)
                     elif '%s%s' % (c, l) == 'Sd':
                         text = self.process_code('Sd', line, 'sb')
+                elif c in 'xXC':
+                    # The PML was modified eariler so x and X put the text
+                    # inside of ="" so we don't have do special processing
+                    # for C.
+                    t = ''
+                    if c in 'XC':
+                        level = line.read(1)
+                    id = 'pml_toc-%s' % len(self.toc)
+                    value = self.code_value(line)
+                    if c == 'x':
+                        t = self.process_code(c, line)
+                    elif c == 'X':
+                        t = self.process_code('%s%s' % (c, level), line)
+                    if not value or value == '':
+                        text = t
                     else:
-                        text = self.process_code('%s%s' % (c, l), line)
+                        self.toc.add_item(os.path.basename(self.file_name), id, value)
+                        text = '<span id="%s"></span>%s' % (id, t)
                 elif c == 'm':
                     empty = False
                     src = self.code_value(line)
@ -515,11 +564,6 @@ class PML_HTMLizer(object):
                 elif c == 'p':
                     empty = False
                     text = '<br /><br style="page-break-after: always;" />'
-                elif c == 'C':
-                    line.read(1)
-                    id = 'pml_toc-%s' % len(self.toc)
-                    self.toc.add_item(self.file_name, id, self.code_value(line))
-                    text = '<span id="%s"></span>' % id
                 elif c == 'n':
                     pass
                 elif c == 'w':