Merging trunk
@@ -40,6 +40,7 @@ recipes/.gitignore
 recipes/README.md
 recipes/icon_checker.py
 recipes/readme_updater.py
+recipes/garfield.recipe
 recipes/katalog_egazeciarz.recipe
 recipes/tv_axnscifi.recipe
 recipes/tv_comedycentral.recipe
@@ -63,6 +64,7 @@ recipes/tv_tvppolonia.recipe
 recipes/tv_tvpuls.recipe
 recipes/tv_viasathistory.recipe
 recipes/icons/katalog_egazeciarz.png
+recipes/icons/garfield.png
 recipes/icons/tv_axnscifi.png
 recipes/icons/tv_comedycentral.png
 recipes/icons/tv_discoveryscience.png
@@ -1,3 +1,4 @@
+# vim:fileencoding=UTF-8:ts=2:sw=2:sta:et:sts=2:ai
 # Each release can have new features and bug fixes. Each of which
 # must have a title and can optionally have linked tickets and a description.
 # In addition they can have a type field which defaults to minor, but should be major
@@ -19,6 +20,101 @@
 # new recipes:
 # - title:

+- version: 0.9.26
+  date: 2013-04-05
+
+  new features:
+    - title: "PDF Output: Allow using templates to create arbitrary headers and footers. Look under PDF Output in the conversion dialog for this feature."
+
+    - title: "ToC Editor: Allow generating the ToC directly from individual files inside the ebook. Useful for EPUBs that have individual chapters in single files."
+      tickets: [1163520]
+
+    - title: "ToC Editor: Add buttons to indent/unindent the current entry"
+
+    - title: "ToC Editor: Right-click menu to perform various useful actions on entries in the ToC"
+
+    - title: "Column icons: Allow use of wide images as column icons"
+
+    - title: "Add USB ids for the Palm Pre2 and Samsung Galaxy phone to the device drivers"
+      tickets: [1162293,1163115]
+
+  bug fixes:
+    - title: "PDF Output: Fix generating page numbers causing links to not work."
+      tickets: [1162573]
+
+    - title: "Wrong filename output in error message when 'Guide reference not found'"
+      tickets: [1163659]
+
+    - title: "Get Books: Update Amazon, Barnes & Noble, Waterstones and Gutenberg store plugins for website change"
+
+    - title: "PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding."
+      tickets: [1162054]
+
+    - title: "ToC Editor: Fix drag and drop of multiple items resulting in the dropped items being in random order sometimes."
+      tickets: [1161999]
+
+  improved recipes:
+    - Financial Times UK
+    - Sing Tao Daily
+    - Apple Daily
+    - A List Apart
+    - Business Week
+    - Harpers printed edition
+    - Harvard Business Review
+
+  new recipes:
+    - title: AM730
+      author: Eddie Lau
+
+    - title: Arret sur images
+      author: Francois D
+
+    - title: Diario de Noticias
+      author: Jose Pinto
+
+- version: 0.9.25
+  date: 2013-03-29
+
+  new features:
+    - title: "Automatic adding: When checking for duplicates is enabled, use the same duplicates found dialog as is used during manual adding."
+      tickets: [1160914]
+
+    - title: "ToC Editor: Allow searching to find a location quickly when browsing through the book to select a location for a ToC item"
+
+    - title: "ToC Editor: Add a button to quickly flatten the entire table of contents"
+
+    - title: "Conversion: When converting a single book to EPUB or AZW3, add an option to automatically launch the Table of Contents editor after the conversion completes. Found under the Table of Contents section of the conversion dialog."
+
+  bug fixes:
+    - title: "calibredb: Nicer error messages when user provides invalid input"
+      tickets: [1160452,1160631]
+
+    - title: "News download: Always use the .jpg extension for jpeg images as apparently Moon+ Reader cannot handle .jpeg"
+
+    - title: "Fix Book Details popup keyboard navigation doesn't work on a Mac"
+      tickets: [1159610]
+
+    - title: "Fix a regression that caused the case of the book files to not be changed when changing the case of the title/author on case insensitive filesystems"
+
+  improved recipes:
+    - RTE news
+    - Various Polish news sources
+    - Psychology Today
+    - Foreign Affairs
+    - History Today
+    - Harpers Magazine (printed edition)
+    - Business Week Magazine
+    - The Hindu
+    - Irish Times
+    - Le Devoir
+
+  new recipes:
+    - title: Fortune Magazine
+      author: Rick Shang
+
+    - title: Eclipse Online
+      author: Jim DeVona
+
 - version: 0.9.24
   date: 2013-03-22
@@ -750,8 +750,61 @@ If this property is detected by |app|, the following custom properties are recog
 opf.series
 opf.seriesindex

-In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used.
-As the cover detection might result in double covers in certain output formats, the process will remove the paragraph (only if the only content is the cover!) from the document. But this works only with the named picture!
+In addition to this, you can specify the picture to use as the cover by naming
+it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no
+picture with this name is found, the 'smart' method is used. As the cover
+detection might result in double covers in certain output formats, the process
+will remove the paragraph (only if the only content is the cover!) from the
+document. But this works only with the named picture!

 To disable cover detection you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes in advanced mode.

+Converting to PDF
+~~~~~~~~~~~~~~~~~~~
+
+The first, most important, setting to decide on when converting to PDF is the page
+size. By default, |app| uses a page size defined by the current
+:guilabel:`Output profile`. So if your output profile is set to Kindle, |app|
+will create a PDF with page size suitable for viewing on the small kindle
+screen. However, if you view this PDF file on a computer screen, then it will
+appear to have too large fonts. To create "normal" sized PDFs, use the override
+page size option under :guilabel:`PDF Output` in the conversion dialog.
+
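The page size can also be overridden when converting from a terminal. This is an illustrative sketch only: it assumes the :guilabel:`PDF Output` page size options (such as ``--paper-size``) are exposed by ``ebook-convert`` in your calibre version; run ``ebook-convert mybook.epub .pdf -h`` to see the exact flag names::

    ebook-convert mybook.epub mybook.pdf --paper-size a4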
+You can insert arbitrary headers and footers on each page of the PDF by
+specifying header and footer templates. Templates are just snippets of HTML
+code that get rendered in the header and footer locations. For example, to
+display page numbers centered at the bottom of every page, in green, use the following
+footer template::
+
+    <p style="text-align:center; color:green">Page _PAGENUM_</p>
+
+|app| will automatically replace _PAGENUM_ with the current page number. You
+can even put different content on even and odd pages, for example the following
+header template will show the title on odd pages and the author on even pages::
+
+    <p style="text-align:right"><span class="even_page">_AUTHOR_</span><span class="odd_page"><i>_TITLE_</i></span></p>
+
+|app| will automatically replace _TITLE_ and _AUTHOR_ with the title and author
+of the document being converted. You can also display text at the left and
+right edges and change the font size, as demonstrated with this header
+template::
+
+    <div style="font-size:x-small"><p style="float:left">_TITLE_</p><p style="float:right;"><i>_AUTHOR_</i></p></div>
+
+This will display the title at the left and the author at the right, in a font
+size smaller than the main text.
+
+Finally, you can also use the current section in templates, as shown below::
+
+    <p style="text-align:right">_SECTION_</p>
+
+_SECTION_ is replaced by whatever the name of the current section is. These
+names are taken from the metadata Table of Contents in the document (the PDF
+Outline). If the document has no table of contents then it will be replaced by
+empty text. If a single PDF page has multiple sections, the first section on
+the page will be used.
+
+.. note:: When adding headers and footers make sure you set the page top and
+   bottom margins to large enough values, under the Page Setup section of the
+   conversion dialog.
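The same templates can be supplied from a terminal; a minimal sketch, assuming the corresponding ``ebook-convert`` flags are named ``--pdf-header-template`` and ``--pdf-footer-template`` in your calibre version::

    ebook-convert mybook.epub mybook.pdf --pdf-footer-template '<p style="text-align:center; color:green">Page _PAGENUM_</p>'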
@@ -129,11 +129,11 @@ tool that always produces valid EPUBs, |app| is not for you.

 How do I use some of the advanced features of the conversion tools?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 You can get help on any individual feature of the converters by mousing over
 it in the GUI or running ``ebook-convert dummy.html .epub -h`` at a terminal.
-A good place to start is to look at the following demo files that demonstrate
-some of the advanced features:
-* `html-demo.zip <http://calibre-ebook.com/downloads/html-demo.zip>`_
+A good place to start is to look at the following demo file that demonstrates
+some of the advanced features
+`html-demo.zip <http://calibre-ebook.com/downloads/html-demo.zip>`_


 Device Integration
@@ -647,12 +647,17 @@ computers. Run |app| on a single computer and access it via the Content Server
 or a Remote Desktop solution.

 If you must share the actual library, use a file syncing tool like
-DropBox or rsync or Microsoft SkyDrive instead of a networked drive. Even with
-these tools there is danger of data corruption/loss, so only do this if you are
-willing to live with that risk. In particular, be aware that **Google Drive**
-is incompatible with |app|, if you put your |app| library in Google Drive, you
-*will* suffer data loss. See
-`this thread <http://www.mobileread.com/forums/showthread.php?t=205581>`_ for details.
+DropBox or rsync or Microsoft SkyDrive instead of a networked drive. If you are
+using a file-syncing tool it is **essential** that you make sure that both
+|app| and the file syncing tool do not try to access the |app| library at the
+same time. In other words, **do not** run the file syncing tool and |app| at
+the same time.
+
+Even with these tools there is danger of data corruption/loss, so only do this
+if you are willing to live with that risk. In particular, be aware that
+**Google Drive** is incompatible with |app|, if you put your |app| library in
+Google Drive, **you will suffer data loss**. See `this thread
+<http://www.mobileread.com/forums/showthread.php?t=205581>`_ for details.

 Content From The Web
 ---------------------
@@ -66,4 +66,3 @@ class Adventure_zone(BasicNewsRecipe):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                 a['href']=self.index + a['href']
         return soup
-
recipes/am730.recipe (new file)
@@ -0,0 +1,290 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
__HiResImg__ = True

'''
Change Log:
2013/03/30 -- first version
'''

from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

class AppleDaily(BasicNewsRecipe):
    title = u'AM730'
    __author__ = 'Eddie Lau'
    publisher = 'AM730'
    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = False
    language = 'zh'
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'

    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at HKT 6am, all news are available
        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)

    def get_fetchdate(self):
        if __Date__ <> '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        if __Date__ <> '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ <> '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ <> '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        if __Date__ <> '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")

    # Note: does not work with custom date given by __Date__
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        ul = soup.find(attrs={'class':'nav-section'})
        sectionList = []
        for li in ul.findAll('li'):
            a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
            title = li.find('a').get('title', False).strip()
            sectionList.append((title, a))
        for title, url in sectionList:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
        current_articles = []
        for item in items:
            a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
            articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
            title = self.tag_to_string(a)
            description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
            current_articles.append({'title': title, 'url': articlelink, 'description': description})
        return current_articles

    def preprocess_html(self, soup):
        multia = soup.findAll('a')
        for a in multia:
            if not (a == None):
                image = a.find('img')
                if not (image == None):
                    if __HiResImg__:
                        image['src'] = image.get('src').replace('/thumbs/', '/')
                    caption = image.get('alt')
                    tag = Tag(soup, "photo", [])
                    tag2 = Tag(soup, "photocaption", [])
                    tag.insert(0, image)
                    if not caption == None:
                        tag2.insert(0, caption)
                    tag.insert(1, tag2)
                    a.replaceWith(tag)
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        if self.publication_type:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        article_titles, aseen = [], set()
        for f in feeds:
            for a in f:
                if a.title and a.title not in aseen:
                    aseen.add(a.title)
                    article_titles.append(force_unicode(a.title, 'utf-8'))

        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
                '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
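A recipe file like the one above can be tried out without installing it into the GUI by passing it directly to ``ebook-convert``, the standard calibre workflow for recipe debugging; the ``--test`` flag limits the fetch to a couple of articles per feed, which keeps runs short (the output filename here is just a placeholder)::

    ebook-convert am730.recipe output.epub --test -vv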
@@ -1,161 +1,275 @@
-# -*- coding: utf-8 -*-
-import re
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
+__license__ = 'GPL v3'
+__copyright__ = '2013, Eddie Lau'
+__Date__ = ''
+
+from calibre import (__appname__, force_unicode, strftime)
+from calibre.utils.date import now as nowf
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.localization import canonicalize_lang
+
 class AppleDaily(BasicNewsRecipe):
-    title = u'蘋果日報'
-    __author__ = u'蘋果日報'
-    __publisher__ = u'蘋果日報'
-    description = u'蘋果日報'
-    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-    language = 'zh_TW'
-    encoding = 'UTF-8'
-    timefmt = ' [%a, %d %b, %Y]'
-    needs_subscription = False
+    title = u'蘋果日報 (香港)'
+    __author__ = 'Eddie Lau'
+    publisher = '蘋果日報'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = False
+    language = 'zh'
+    encoding = 'utf-8'
+    auto_cleanup = False
     remove_javascript = True
-    remove_tags_before = dict(name=['ul', 'h1'])
-    remove_tags_after = dict(name='form')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
-                   dict(name=['script', 'noscript', 'style', 'form'])]
+    use_embedded_content = False
     no_stylesheets = True
-    extra_css = '''
-    @font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
-    body {margin-right: 8pt; font-family: 'uming', serif;}
-    h1 {font-family: 'uming', serif, sans-serif}
-    '''
-    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
-
-    preprocess_regexps = [
-        (re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
-        lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
-    ]
+    description = 'http://hkm.appledaily.com/'
+    category = 'Chinese, News, Hong Kong'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
+
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
+    keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
+    remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
+                   dict(name='p', attrs={'class':'next'})]
+
+    def get_dtlocal(self):
+        dt_utc = datetime.datetime.utcnow()
+        # convert UTC to local hk time - at HKT 6am, all news are available
+        return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24)
+
+    def get_fetchdate(self):
+        if __Date__ <> '':
+            return __Date__
+        else:
+            return self.get_dtlocal().strftime("%Y%m%d")
+
+    def get_fetchformatteddate(self):
+        if __Date__ <> '':
+            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%Y-%m-%d")
+
+    def get_fetchyear(self):
+        if __Date__ <> '':
+            return __Date__[0:4]
+        else:
+            return self.get_dtlocal().strftime("%Y")
+
+    def get_fetchmonth(self):
+        if __Date__ <> '':
+            return __Date__[4:6]
+        else:
+            return self.get_dtlocal().strftime("%m")
+
+    def get_fetchday(self):
+        if __Date__ <> '':
+            return __Date__[6:8]
+        else:
+            return self.get_dtlocal().strftime("%d")
+
+    # Note: does not work with custom date given by __Date__
+    def get_weekday(self):
+        return self.get_dtlocal().weekday()
+
     def get_cover_url(self):
-        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
-
-    #def get_browser(self):
-        #br = BasicNewsRecipe.get_browser(self)
-        #if self.username is not None and self.password is not None:
-        #    br.open('http://www.nytimes.com/auth/login')
-        #    br.select_form(name='login')
-        #    br['USERID'] = self.username
-        #    br['PASSWORD'] = self.password
-        #    br.submit()
-        #return br
-
-    def preprocess_html(self, soup):
-        #process all the images
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl = tag['src']
-            #print 'checking image: ' + iurl
-
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-
-            m = p.search(iurl)
-
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['src'] = iurl
-            #else:
-                #print 'not good'
-
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
-            iurl = tag['href']
-            #print 'checking image: ' + iurl
-
-            #img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
-            p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
-
-            m = p.search(iurl)
-
-            if m is not None:
-                iurl = 'http://' + m.group('server') + '/' + m.group('path')
-                #print 'working! new url: ' + iurl
-                tag['href'] = iurl
-            #else:
-                #print 'not good'
-
-        return soup
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        cover = soup.find(attrs={'class':'top-news'}).get('src', False)
+        br = BasicNewsRecipe.get_browser(self)
+        try:
+            br.open(cover)
+        except:
+            cover = None
+        return cover
+
+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])
+
     def parse_index(self):
-        base = 'http://news.hotpot.hk/fruit'
-        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
-
-        #def feed_title(div):
-        #    return ''.join(div.findAll(text=True, recursive=False)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        for div in soup.findAll('li'):
-            key = div.find(text=True, recursive=True);
-            #if key == u'豪情':
-            #    continue;
-
-            print 'section=' + key
-
-            articles[key] = []
-
-            ans.append(key)
-            a = div.find('a', href=True)
-            if not a:
-                continue
-            url = base + '/' + a['href']
-            print 'url=' + url
-
-            if not articles.has_key(key):
-                articles[key] = []
-            else:
-                # sub page
-                subSoup = self.index_to_soup(url)
-
-                for subDiv in subSoup.findAll('li'):
-                    subA = subDiv.find('a', href=True)
-                    subTitle = subDiv.find(text=True, recursive=True)
-                    subUrl = base + '/' + subA['href']
-
-                    print 'subUrl' + subUrl
-
-                    articles[key].append(
-                        dict(title=subTitle,
-                             url=subUrl,
-                             date='',
-                             description='',
-                             content=''))
-
-        # elif div['class'] in ['story', 'story headline']:
-        #     a = div.find('a', href=True)
-        #     if not a:
-        #         continue
-        #     url = re.sub(r'\?.*', '', a['href'])
-        #     url += '?pagewanted=all'
-        #     title = self.tag_to_string(a, use_alt=True).strip()
-        #     description = ''
-        #     pubdate = strftime('%a, %d %b')
-        #     summary = div.find(True, attrs={'class':'summary'})
-        #     if summary:
-        #         description = self.tag_to_string(summary, use_alt=False)
-        #
-        #     feed = key if key is not None else 'Uncategorized'
-        #     if not articles.has_key(feed):
-        #         articles[feed] = []
-        #     if not 'podcasts' in url:
-        #         articles[feed].append(
-        #             dict(title=title, url=url, date=pubdate,
-        #                  description=description,
-        #                  content=''))
-        # ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
-        return ans
+        feeds = []
+        soup = self.index_to_soup('http://hkm.appledaily.com/')
+        ul = soup.find(attrs={'class':'menu'})
+        sectionList = []
+        for li in ul.findAll('li'):
+            a = 'http://hkm.appledaily.com/' + li.find('a', href=True).get('href', False)
+            title = li.find('a', text=True).strip()
+            if not title == u'動新聞':
+                sectionList.append((title, a))
+        for title, url in sectionList:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def parse_section(self, url):
+        soup = self.index_to_soup(url)
+        ul = soup.find(attrs={'class':'list'})
+        current_articles = []
+        for li in ul.findAll('li'):
+            a = li.find('a', href=True)
+            title = li.find('p', text=True).strip()
+            if a is not None:
+                current_articles.append({'title': title, 'url':'http://hkm.appledaily.com/' + a.get('href', False)})
+            pass
+        return current_articles
+
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        if self.publication_type:
+            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        article_titles, aseen = [], set()
+        for f in feeds:
+            for a in f:
+                if a.title and a.title not in aseen:
+                    aseen.add(a.title)
+                    article_titles.append(force_unicode(a.title, 'utf-8'))
+
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
+                '\n\n'.join(article_titles))
+
+        language = canonicalize_lang(self.language)
+        if language is not None:
+            mi.language = language
+        # This one affects the pub date shown in kindle title
+        #mi.pubdate = nowf()
+        # now appears to need the time field to be > 12.00noon as well
+        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    tt = a.toc_thumbnail if a.toc_thumbnail else None
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None,
+                            a.title if a.title else _('Untitled Article'),
+                            play_order=po, author=auth,
+                            description=desc, toc_thumbnail=tt)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                            not self.has_single_feed,
+                                            a.orig_url, __appname__, prefix=prefix,
+                                            center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                    f.title, play_order=po, description=desc, author=auth))
+
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
recipes/arret_sur_images.recipe (new file)
@@ -0,0 +1,54 @@
from __future__ import unicode_literals

__license__ = 'WTFPL'
__author__ = '2013, François D. <franek at chicour.net>'
__description__ = 'Get some fresh news from Arrêt sur images'


from calibre.web.feeds.recipes import BasicNewsRecipe

class Asi(BasicNewsRecipe):

    title = 'Arrêt sur images'
    __author__ = 'François D. (aka franek)'
    description = 'Global news in french from news site "Arrêt sur images"'

    oldest_article = 7.0
    language = 'fr'
    needs_subscription = True
    max_articles_per_feed = 100

    simultaneous_downloads = 1
    timefmt = '[%a, %d %b %Y %I:%M +0200]'
    cover_url = 'http://www.arretsurimages.net/images/header/menu/menu_1.png'

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        ('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'),
        ('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'),
        ('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'),
    ]

    conversion_options = { 'smarten_punctuation' : True }

    remove_tags = [dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'), dict(id='col_right'), dict(name='div', attrs={'class':'bloc-chroniqueur-2'}), dict(id='footercontainer')]

    def print_version(self, url):
        return url.replace('contenu.php', 'contenu-imprimable.php')

    def get_browser(self):
        # Need to use robust HTML parser
        br = BasicNewsRecipe.get_browser(self, use_robust_parser=True)
        if self.username is not None and self.password is not None:
            br.open('http://www.arretsurimages.net/index.php')
            br.select_form(nr=0)
            br.form.set_all_readonly(False)
            br['redir'] = 'forum/login.php'
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class Astroflesz(BasicNewsRecipe):
|
class Astroflesz(BasicNewsRecipe):
|
||||||
title = u'Astroflesz'
|
title = u'Astroflesz'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne'
|
description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne'
|
||||||
category = 'astronomy'
|
category = 'astronomy'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png'
|
cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png'
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
@ -17,7 +17,7 @@ class Astroflesz(BasicNewsRecipe):
|
|||||||
keep_only_tags = [dict(id="k2Container")]
|
keep_only_tags = [dict(id="k2Container")]
|
||||||
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
|
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
|
||||||
remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
|
remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
|
||||||
feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')]
|
feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')]
|
||||||
|
|
||||||
def postprocess_html(self, soup, first_fetch):
|
def postprocess_html(self, soup, first_fetch):
|
||||||
t = soup.find(attrs={'class':'itemIntroText'})
|
t = soup.find(attrs={'class':'itemIntroText'})
|
||||||
|
@ -1,17 +1,20 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
class BadaniaNet(BasicNewsRecipe):
|
class BadaniaNet(BasicNewsRecipe):
|
||||||
title = u'badania.net'
|
title = u'badania.net'
|
||||||
__author__ = 'fenuks'
|
__author__ = 'fenuks'
|
||||||
description = u'chcesz wiedzieć więcej?'
|
description = u'chcesz wiedzieć więcej?'
|
||||||
category = 'science'
|
category = 'science'
|
||||||
language = 'pl'
|
language = 'pl'
|
||||||
cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png'
|
cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png'
|
||||||
|
extra_css = '.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
preprocess_regexps = [(re.compile(r"<h4>Tekst sponsoruje</h4>", re.IGNORECASE), lambda m: ''),]
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})]
|
remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})]
|
||||||
remove_tags_after = dict(attrs={'class':'omc-single-tags'})
|
remove_tags_after = dict(attrs={'class':'omc-single-tags'})
|
||||||
keep_only_tags = [dict(id='omc-full-article')]
|
keep_only_tags = [dict(id='omc-full-article')]
|
||||||
feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')]
|
feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')]
|
@@ -9,14 +9,14 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     __author__ = 'Dave Asbury'
     cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
     oldest_article = 2
-    max_articles_per_feed = 12
+    max_articles_per_feed = 20
     linearize_tables = True
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
     auto_cleanup = True
     language = 'en_GB'
+    compress_news_images = True
     cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'

     masthead_url = 'http://www.trinitymirror.com/images/birminghampost-logo.gif'
@@ -37,68 +37,15 @@ class BusinessWeek(BasicNewsRecipe):
         , 'language' : language
     }

-    #remove_tags = [
-    #dict(attrs={'class':'inStory'})
-    #,dict(name=['meta','link','iframe','base','embed','object','table','th','tr','td'])
-    #,dict(attrs={'id':['inset','videoDisplay']})
-    #]
-    #keep_only_tags = [dict(name='div', attrs={'id':['story-body','storyBody']})]
-    remove_attributes = ['lang']
-    match_regexps = [r'http://www.businessweek.com/.*_page_[1-9].*']
-
     feeds = [
-        (u'Top Stories', u'http://www.businessweek.com/topStories/rss/topStories.rss'),
-        (u'Top News' , u'http://www.businessweek.com/rss/bwdaily.rss' ),
-        (u'Asia', u'http://www.businessweek.com/rss/asia.rss'),
-        (u'Autos', u'http://www.businessweek.com/rss/autos/index.rss'),
-        (u'Classic Cars', u'http://rss.businessweek.com/bw_rss/classiccars'),
-        (u'Hybrids', u'http://rss.businessweek.com/bw_rss/hybrids'),
-        (u'Europe', u'http://www.businessweek.com/rss/europe.rss'),
-        (u'Auto Reviews', u'http://rss.businessweek.com/bw_rss/autoreviews'),
-        (u'Innovation & Design', u'http://www.businessweek.com/rss/innovate.rss'),
-        (u'Architecture', u'http://www.businessweek.com/rss/architecture.rss'),
-        (u'Brand Equity', u'http://www.businessweek.com/rss/brandequity.rss'),
-        (u'Auto Design', u'http://www.businessweek.com/rss/carbuff.rss'),
-        (u'Game Room', u'http://rss.businessweek.com/bw_rss/gameroom'),
-        (u'Technology', u'http://www.businessweek.com/rss/technology.rss'),
-        (u'Investing', u'http://rss.businessweek.com/bw_rss/investor'),
-        (u'Small Business', u'http://www.businessweek.com/rss/smallbiz.rss'),
-        (u'Careers', u'http://rss.businessweek.com/bw_rss/careers'),
-        (u'B-Schools', u'http://www.businessweek.com/rss/bschools.rss'),
-        (u'Magazine Selections', u'http://www.businessweek.com/rss/magazine.rss'),
-        (u'CEO Guide to Tech', u'http://www.businessweek.com/rss/ceo_guide_tech.rss'),
+        (u'Top Stories', u'http://www.businessweek.com/feeds/most-popular.rss'),
     ]

-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if 'podcasts' in url:
-            return None
-        if 'surveys' in url:
-            return None
-        if 'images' in url:
-            return None
-        if 'feedroom' in url:
-            return None
-        if '/magazine/toc/' in url:
-            return None
-        rurl, sep, rest = url.rpartition('?')
-        if rurl:
-            return rurl
-        return rest
-
     def print_version(self, url):
-        if '/news/' in url or '/blog/ in url':
-            return url
-        rurl = url.replace('http://www.businessweek.com/','http://www.businessweek.com/print/')
-        return rurl.replace('/investing/','/investor/')
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for alink in soup.findAll('a'):
-            if alink.string is not None:
-                tstr = alink.string
-                alink.replaceWith(tstr)
-        return soup
+        soup = self.index_to_soup(url)
+        prntver = soup.find('li', attrs={'class':'print tracked'})
+        rurl = prntver.find('a', href=True)['href']
+        return rurl
@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from collections import OrderedDict

@@ -39,7 +40,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
             title=self.tag_to_string(div.a).strip()
             url=div.a['href']
             soup0 = self.index_to_soup(url)
-            urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
+            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
             articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''})

@@ -56,7 +57,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
             title=self.tag_to_string(div.a).strip()
             url=div.a['href']
             soup0 = self.index_to_soup(url)
-            urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href']
+            urlprint=soup0.find('a', attrs={'href':re.compile('.*printer.*')})['href']
             articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''})

         if articles:
@@ -7,13 +7,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
     __author__ = 'Dave Asbury'
     description = 'The official website of Countryfile Magazine'
-    # last updated 8/12/12
+    # last updated 19/10/12
     language = 'en_GB'
     oldest_article = 30
     max_articles_per_feed = 25
     remove_empty_feeds = True
     no_stylesheets = True
     auto_cleanup = True
+    compress_news_images = True
     ignore_duplicate_articles = {'title', 'url'}
     #articles_are_obfuscated = True
     #article_already_exists = False
@@ -13,9 +13,9 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):

     masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

+    compress_news_images = True
     oldest_article = 1
-    max_articles_per_feed = 1
+    max_articles_per_feed = 12
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
recipes/diario_de_noticias.recipe (new file, 23 lines)
@@ -0,0 +1,23 @@
+# vim:fileencoding=UTF-8
+
+from __future__ import unicode_literals
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1365070687(BasicNewsRecipe):
+    title ='Diário de Notícias'
+    oldest_article = 7
+    language = 'pt'
+    __author__ = 'Jose Pinto'
+    max_articles_per_feed = 100
+    keep_only_tags = [dict(name='div', attrs={'id':'cln-esqmid'}) ]
+    remove_tags = [ dict(name='table', attrs={'class':'TabFerramentasInf'}) ]
+
+    feeds = [(u'Portugal', u'http://feeds.dn.pt/DN-Portugal'),
+             (u'Globo', u'http://feeds.dn.pt/DN-Globo'),
+             (u'Economia', u'http://feeds.dn.pt/DN-Economia'),
+             (u'Ci\xeancia', u'http://feeds.dn.pt/DN-Ciencia'),
+             (u'Artes', u'http://feeds.dn.pt/DN-Artes'),
+             (u'TV & Media', u'http://feeds.dn.pt/DN-Media'),
+             (u'Opini\xe3o', u'http://feeds.dn.pt/DN-Opiniao'),
+             (u'Pessoas', u'http://feeds.dn.pt/DN-Pessoas')
+            ]
recipes/dzial_zagraniczny.recipe (new file, 27 lines)
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = 'teepel <teepel44@gmail.com>'
+
+'''
+dzialzagraniczny.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class dzial_zagraniczny(BasicNewsRecipe):
+    title = u'Dział Zagraniczny'
+    __author__ = 'teepel <teepel44@gmail.com>'
+    language = 'pl'
+    description = u'Polskiego czytelnika to nie interesuje'
+    INDEX = 'http://dzialzagraniczny.pl'
+    extra_css = 'img {display: block;}'
+    oldest_article = 7
+    cover_url = 'https://fbcdn-profile-a.akamaihd.net/hprofile-ak-prn1/c145.5.160.160/559442_415653975115959_2126205128_n.jpg'
+    max_articles_per_feed = 100
+    remove_empty_feeds = True
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = True
+
+    feeds = [(u'Dział zagraniczny', u'http://feeds.feedburner.com/dyndns/UOfz')]
recipes/economia.recipe (new file, 17 lines)
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1314326622(BasicNewsRecipe):
+    title = u'Economia'
+    __author__ = 'Manish Bhattarai'
+    description = 'Economia - Intelligence & Insight for ICAEW Members'
+    language = 'en_GB'
+    oldest_article = 7
+    max_articles_per_feed = 25
+    masthead_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
+    cover_url = 'http://economia.icaew.com/~/media/Images/Design%20Images/Economia_Red_website.ashx'
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_tags_before = dict(id='content')
+    remove_tags_after = dict(id='stars-wrapper')
+    remove_tags = [dict(attrs={'class':['floatR', 'sharethis', 'rating clearfix']})]
+    feeds = [(u'News', u'http://feedity.com/icaew-com/VlNTVFRa.rss'),(u'Business', u'http://feedity.com/icaew-com/VlNTVFtS.rss'),(u'People', u'http://feedity.com/icaew-com/VlNTVFtX.rss'),(u'Opinion', u'http://feedity.com/icaew-com/VlNTVFtW.rss'),(u'Finance', u'http://feedity.com/icaew-com/VlNTVFtV.rss')]
@@ -26,7 +26,7 @@ class ElDiplo_Recipe(BasicNewsRecipe):
     title = u'El Diplo'
     __author__ = 'Tomas Di Domenico'
     description = 'Publicacion mensual de Le Monde Diplomatique, edicion Argentina'
-    langauge = 'es_AR'
+    language = 'es_AR'
     needs_subscription = True
     auto_cleanup = True
recipes/equipped.recipe (new file, 29 lines)
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = 'teepel <teepel44@gmail.com>, Artur Stachecki <artur.stachecki@gmail.com>'
+
+'''
+equipped.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class equipped(BasicNewsRecipe):
+    title = u'Equipped'
+    __author__ = 'teepel <teepel44@gmail.com>'
+    language = 'pl'
+    description = u'Wiadomości z equipped.pl'
+    INDEX = 'http://equipped.pl'
+    extra_css = '.alignleft {float:left; margin-right:5px;}'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_empty_feeds = True
+    simultaneous_downloads = 5
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    #keep_only_tags = [dict(name='article')]
+    #remove_tags = [dict(id='disqus_thread')]
+    #remove_tags_after = [dict(id='disqus_thread')]
+
+    feeds = [(u'Equipped', u'http://feeds.feedburner.com/Equippedpl?format=xml')]
@@ -12,12 +12,6 @@ class EsensjaRSS(BasicNewsRecipe):
     language = 'pl'
     encoding = 'utf-8'
     INDEX = 'http://www.esensja.pl'
-    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
-                   .t-author {font-size: x-small; text-align: left}
-                   .t-title2 {font-size: x-small; font-style: italic; text-align: left}
-                   .text {font-size: small; text-align: left}
-                   .annot-ref {font-style: italic; text-align: left}
-                   '''
     cover_url = ''
     masthead_url = 'http://esensja.pl/img/wrss.gif'
     use_embedded_content = False
@@ -1,20 +1,54 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 import re
 class FilmOrgPl(BasicNewsRecipe):
     title = u'Film.org.pl'
     __author__ = 'fenuks'
     description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce."
     category = 'film'
     language = 'pl'
-    extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;}'
+    extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;} .recenzja-title {font-size: 150%; margin-top: 5px; margin-bottom: 5px;}'
     cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png'
     ignore_duplicate_articles = {'title', 'url'}
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
+    remove_javascript = True
     remove_empty_feeds = True
-    use_embedded_content = True
-    preprocess_regexps = [(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE|re.DOTALL), lambda m: '</body>'), (re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: '')]
-    remove_tags = [dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']})]
-    feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')]
+    use_embedded_content = False
+    remove_attributes = ['style']
+    preprocess_regexps = [(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE|re.DOTALL), lambda m: '</body>'), (re.compile(ur'</?center>', re.IGNORECASE|re.DOTALL), lambda m: ''), (re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?){2,}', re.IGNORECASE|re.DOTALL), lambda m: '')]
+    keep_only_tags = [dict(name=['h11', 'h16', 'h17']), dict(attrs={'class':'editor'})]
+    remove_tags_after = dict(id='comments')
+    remove_tags = [dict(name=['link', 'meta', 'style']), dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']}), dict(id='comments'), dict(attrs={'style':'border: 0pt none ; margin: 0pt; padding: 0pt;'}), dict(name='p', attrs={'class':'rating'}), dict(attrs={'layout':'button_count'})]
+    feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')]
+
+    def append_page(self, soup, appendtag):
+        tag = soup.find('div', attrs={'class': 'pagelink'})
+        if tag:
+            for nexturl in tag.findAll('a'):
+                url = nexturl['href']
+                soup2 = self.index_to_soup(url)
+                pagetext = soup2.find(attrs={'class': 'editor'})
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                for comment in comments:
+                    comment.extract()
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+            for r in appendtag.findAll(attrs={'class': 'pagelink'}):
+                r.extract()
+            for r in appendtag.findAll(attrs={'id': 'comments'}):
+                r.extract()
+            for r in appendtag.findAll(attrs={'style':'border: 0pt none ; margin: 0pt; padding: 0pt;'}):
+                r.extract()
+            for r in appendtag.findAll(attrs={'layout':'button_count'}):
+                r.extract()
+
+    def preprocess_html(self, soup):
+        for c in soup.findAll('h11'):
+            c.name = 'h1'
+        self.append_page(soup, soup.body)
+        for r in soup.findAll('br'):
+            r.extract()
+        return soup
@@ -8,6 +8,7 @@ import datetime
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
+from collections import OrderedDict

 class FinancialTimes(BasicNewsRecipe):
     title = 'Financial Times (UK)'
@@ -93,7 +94,7 @@ class FinancialTimes(BasicNewsRecipe):
             try:
                 urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
             except:
                 continue
             title = self.tag_to_string(item)
             date = strftime(self.timefmt)
             articles.append({
|
|||||||
return articles
|
return articles
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
feeds = []
|
feeds = OrderedDict()
|
||||||
soup = self.index_to_soup(self.INDEX)
|
soup = self.index_to_soup(self.INDEX)
|
||||||
dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
#dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div'))
|
||||||
self.timefmt = ' [%s]'%dates
|
#self.timefmt = ' [%s]'%dates
|
||||||
wide = soup.find('div',attrs={'class':'wide'})
|
section_title = 'Untitled'
|
||||||
if not wide:
|
|
||||||
return feeds
|
for column in soup.findAll('div', attrs = {'class':'feedBoxes clearfix'}):
|
||||||
allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()})
|
for section in column. findAll('div', attrs = {'class':'feedBox'}):
|
||||||
if not allsections:
|
sectiontitle=self.tag_to_string(section.find('h4'))
|
||||||
return feeds
|
if '...' not in sectiontitle: section_title=sectiontitle
|
||||||
count = 0
|
for article in section.ul.findAll('li'):
|
||||||
for item in allsections:
|
articles = []
|
||||||
count = count + 1
|
title=self.tag_to_string(article.a)
|
||||||
if self.test and count > 2:
|
url=article.a['href']
|
||||||
return feeds
|
articles.append({'title':title, 'url':url, 'description':'', 'date':''})
|
||||||
fitem = item.h3
|
|
||||||
if not fitem:
|
if articles:
|
||||||
fitem = item.h4
|
if section_title not in feeds:
|
||||||
ftitle = self.tag_to_string(fitem)
|
feeds[section_title] = []
|
||||||
self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
|
feeds[section_title] += articles
|
||||||
feedarts = self.get_artlinks(item.ul)
|
|
||||||
feeds.append((ftitle,feedarts))
|
|
||||||
return feeds
|
ans = [(key, val) for key, val in feeds.iteritems()]
|
||||||
|
return ans
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
items = ['promo-box','promo-title',
|
items = ['promo-box','promo-title',
|
||||||
@@ -174,9 +176,6 @@ class FinancialTimes(BasicNewsRecipe):
             count += 1
             tfile = PersistentTemporaryFile('_fa.html')
             tfile.write(html)
             tfile.close()
             self.temp_files.append(tfile)
             return tfile.name
-
-    def cleanup(self):
-        self.browser.open('https://registration.ft.com/registration/login/logout?location=')
@@ -1,12 +1,12 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+
 import re

 from calibre.web.feeds.news import BasicNewsRecipe


 class FocusRecipe(BasicNewsRecipe):

-    __license__ = 'GPL v3'
-    __author__ = u'intromatyk <intromatyk@gmail.com>'
+    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
     language = 'pl'
     version = 1
recipes/forbes_pl.recipe (new file, 53 lines)
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import datetime
+import re
+
+class forbes_pl(BasicNewsRecipe):
+    title = u'Forbes.pl'
+    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
+    language = 'pl'
+    description = u'Biznes, finanse, gospodarka, strategie, wiadomości gospodarcze, analizy finasowe i strategiczne.'
+    oldest_article = 1
+    index = 'http://www.forbes.pl'
+    cover_url = 'http://www.forbes.pl/resources/front/images/logo.png'
+    max_articles_per_feed = 100
+    extra_css = '.Block-Photo {float:left; max-width: 300px; margin-right: 5px;}'
+    preprocess_regexps = [(re.compile(ur'<p>(<strong>)?(Czytaj|Zobacz) (też|także):.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<strong>Zobacz:.*?</strong>', re.DOTALL), lambda match: '')]
+    remove_javascript = True
+    no_stylesheets = True
+    now = datetime.datetime.now()
+    yesterday = now - datetime.timedelta(hours=24)
+    yesterday = yesterday.strftime("%d.%m.%Y %H:%M:%S")
+    pages_count = 4
+    keep_only_tags = [dict(attrs={'class':['Block-Node Content-Article ', 'Block-Node Content-Article piano-closed']})]
+    remove_tags = [dict(attrs={'class':['Keywords Styled', 'twitter-share-button', 'Block-List-Related Block-List']})]
+
+    feeds = [(u'Wszystkie', 'http://www.forbes.pl/rss')]
+
+    '''def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        return soup
+
+
+    def append_page(self, soup, appendtag):
+        cleanup = False
+        nexturl = appendtag.find('a', attrs={'class':'next'})
+        if nexturl:
+            cleanup = True
+        while nexturl:
+            soup2 = self.index_to_soup(self.index + nexturl['href'])
+            nexturl = soup2.find('a', attrs={'class':'next'})
+            pagetext = soup2.findAll(id='article-body-wrapper')
+            if not pagetext:
+                pagetext = soup2.findAll(attrs={'class':'Article-Entry Styled'})
+            for comment in pagetext.findAll(text=lambda text:isinstance(text, Comment)):
+                comment.extract()
+            pos = len(appendtag.contents)
+            appendtag.insert(pos, pagetext)
+        if cleanup:
+            for r in appendtag.findAll(attrs={'class':'paginator'}):
+                r.extract()'''
@@ -6,6 +6,7 @@ __copyright__ = u'2010-2013, Tomasz Dlugosz <tomek3d@gmail.com>'
 fronda.pl
 '''

+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import timedelta, date
@@ -23,6 +24,7 @@ class Fronda(BasicNewsRecipe):
     extra_css = '''
         h1 {font-size:150%}
         .body {text-align:left;}
+        div#featured-image {font-style:italic; font-size:70%}
         '''

     earliest_date = date.today() - timedelta(days=oldest_article)
@@ -55,7 +57,10 @@ class Fronda(BasicNewsRecipe):
         articles = {}

         for url, genName in genres:
-            soup = self.index_to_soup('http://www.fronda.pl/c/'+ url)
+            try:
+                soup = self.index_to_soup('http://www.fronda.pl/c/'+ url)
+            except:
+                continue
             articles[genName] = []
             for item in soup.findAll('li'):
                 article_h = item.find('h2')
@@ -77,16 +82,15 @@ class Fronda(BasicNewsRecipe):
     ]

     remove_tags = [
-        dict(name='div', attrs={'class':['related-articles',
-                                         'button right',
-                                         'pagination']}),
+        dict(name='div', attrs={'class':['related-articles','button right','pagination','related-articles content']}),
         dict(name='h3', attrs={'class':'block-header article comments'}),
-        dict(name='ul', attrs={'class':'comment-list'}),
-        dict(name='ul', attrs={'class':'category'}),
-        dict(name='ul', attrs={'class':'tag-list'}),
+        dict(name='ul', attrs={'class':['comment-list','category','tag-list']}),
         dict(name='p', attrs={'id':'comments-disclaimer'}),
         dict(name='div', attrs={'style':'text-align: left; margin-bottom: 15px;'}),
         dict(name='div', attrs={'style':'text-align: left; margin-top: 15px; margin-bottom: 30px;'}),
-        dict(name='div', attrs={'class':'related-articles content'}),
-        dict(name='div', attrs={'id':'comment-form'})
+        dict(name='div', attrs={'id':'comment-form'}),
+        dict(name='span', attrs={'class':'separator'})
     ]
+
+    preprocess_regexps = [
+        (re.compile(r'komentarzy: .*?</h6>', re.IGNORECASE | re.DOTALL | re.M ), lambda match: '</h6>')]
recipes/galaxys_edge.recipe (new file, 108 lines)
@@ -0,0 +1,108 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GalaxyEdge(BasicNewsRecipe):
+    title = u'The Galaxy\'s Edge'
+    language = 'en'
+
+    oldest_article = 7
+    __author__ = 'Krittika Goyal'
+    no_stylesheets = True
+
+    auto_cleanup = True
+
+    #keep_only_tags = [dict(id='content')]
+    #remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
+    #dict(id=['email-section', 'right-column', 'printfooter', 'topover',
+    #'slidebox', 'th_footer'])]
+
+    extra_css = '.photo-caption { font-size: smaller }'
+
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.galaxysedge.com/')
+        main = soup.find('table', attrs={'width':'911'})
+        toc = main.find('td', attrs={'width':'225'})
+
+        current_section = None
+        current_articles = []
+        feeds = []
+        c = 0
+        for x in toc.findAll(['p']):
+            c = c+1
+            if c == 5:
+                if current_articles and current_section:
+                    feeds.append((current_section, current_articles))
+                edwo = x.find('a')
+                current_section = self.tag_to_string(edwo)
+                current_articles = []
+                self.log('\tFound section:', current_section)
+                title = self.tag_to_string(edwo)
+                url = edwo.get('href', True)
+                url = 'http://www.galaxysedge.com/'+url
+                print(title)
+                print(c)
+                if not url or not title:
+                    continue
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+            elif c>5:
+                current_section = self.tag_to_string(x.find('b'))
+                current_articles = []
+                self.log('\tFound section:', current_section)
+                for y in x.findAll('a'):
+                    title = self.tag_to_string(y)
+                    url = y.get('href', True)
+                    url = 'http://www.galaxysedge.com/'+url
+                    print(title)
+                    if not url or not title:
+                        continue
+                    self.log('\t\tFound article:', title)
+                    self.log('\t\t\t', url)
+                    current_articles.append({'title': title, 'url':url,
+                        'description':'', 'date':''})
+        if current_articles and current_section:
+            feeds.append((current_section, current_articles))
+
+        return feeds
+
+    #def preprocess_raw_html(self, raw, url):
+        #return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
+
+    #def postprocess_html(self, soup, first_fetch):
+        #for t in soup.findAll(['table', 'tr', 'td','center']):
+            #t.name = 'div'
+        #return soup
+
+    #def parse_index(self):
+        #today = time.strftime('%Y-%m-%d')
+        #soup = self.index_to_soup(
+            #'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
+        #div = soup.find(id='left-column')
+        #feeds = []
+        #current_section = None
+        #current_articles = []
+        #for x in div.findAll(['h3', 'div']):
+            #if current_section and x.get('class', '') == 'tpaper':
+                #a = x.find('a', href=True)
+                #if a is not None:
+                    #current_articles.append({'url':a['href']+'?css=print',
+                        #'title':self.tag_to_string(a), 'date': '',
+                        #'description':''})
+            #if x.name == 'h3':
+                #if current_section and current_articles:
+                    #feeds.append((current_section, current_articles))
+                #current_section = self.tag_to_string(x)
+                #current_articles = []
+        #return feeds
@@ -14,13 +14,14 @@ class gazetaprawna(BasicNewsRecipe):
     title = u'Gazeta Prawna'
     __author__ = u'Vroo'
     publisher = u'Infor Biznes'
-    oldest_article = 7
+    oldest_article = 1
     max_articles_per_feed = 20
     no_stylesheets = True
     remove_javascript = True
     description = 'Polski dziennik gospodarczy'
     language = 'pl'
     encoding = 'utf-8'
+    ignore_duplicate_articles = {'title', 'url'}

     remove_tags_after = [
         dict(name='div', attrs={'class':['data-art']})
@@ -30,7 +31,7 @@ class gazetaprawna(BasicNewsRecipe):
     ]

     feeds = [
-        (u'Wiadomo\u015bci - najwa\u017cniejsze', u'http://www.gazetaprawna.pl/wiadomosci/najwazniejsze/rss.xml'),
+        (u'Z ostatniej chwili', u'http://rss.gazetaprawna.pl/GazetaPrawna'),
         (u'Biznes i prawo gospodarcze', u'http://biznes.gazetaprawna.pl/rss.xml'),
         (u'Prawo i wymiar sprawiedliwo\u015bci', u'http://prawo.gazetaprawna.pl/rss.xml'),
         (u'Praca i ubezpieczenia', u'http://praca.gazetaprawna.pl/rss.xml'),
@@ -51,3 +52,8 @@ class gazetaprawna(BasicNewsRecipe):
         url = url.replace('prawo.gazetaprawna', 'www.gazetaprawna')
         url = url.replace('praca.gazetaprawna', 'www.gazetaprawna')
         return url
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.egazety.pl/infor/e-wydanie-dziennik-gazeta-prawna.html')
+        self.cover_url = soup.find('p', attrs={'class':'covr'}).a['href']
+        return getattr(self, 'cover_url', self.cover_url)
@@ -10,7 +10,7 @@ krakow.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe

 class gw_krakow(BasicNewsRecipe):
-    title = u'Gazeta.pl Kraków'
+    title = u'Gazeta Wyborcza Kraków'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
     description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
@@ -5,7 +5,7 @@ import string
 from calibre.web.feeds.news import BasicNewsRecipe

 class GazetaPlSzczecin(BasicNewsRecipe):
-    title = u'Gazeta.pl Szczecin'
+    title = u'Gazeta Wyborcza Szczecin'
     description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
     __author__ = u'Michał Szkutnik'
     __license__ = u'GPL v3'
@@ -10,7 +10,7 @@ warszawa.gazeta.pl
 from calibre.web.feeds.news import BasicNewsRecipe

 class gw_wawa(BasicNewsRecipe):
-    title = u'Gazeta.pl Warszawa'
+    title = u'Gazeta Wyborcza Warszawa'
     __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
     language = 'pl'
     description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
@@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Comment

 class Gazeta_Wyborcza(BasicNewsRecipe):
-    title = u'Gazeta.pl'
+    title = u'Gazeta Wyborcza'
     __author__ = 'fenuks, Artur Stachecki'
     language = 'pl'
     description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
@@ -9,7 +9,7 @@ gofin.pl

 from calibre.web.feeds.news import BasicNewsRecipe

-class gofin(AutomaticNewsRecipe):
+class gofin(BasicNewsRecipe):
     title = u'Gofin'
     __author__ = 'teepel <teepel44@gmail.com>'
     language = 'pl'
@@ -2,22 +2,22 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup

 class Gram_pl(BasicNewsRecipe):
     title = u'Gram.pl'
     __author__ = 'fenuks'
     description = u'Serwis społecznościowy o grach: recenzje, newsy, zapowiedzi, encyklopedia gier, forum. Gry PC, PS3, X360, PS Vita, sprzęt dla graczy.'
     category = 'games'
     language = 'pl'
     oldest_article = 8
     index='http://www.gram.pl'
     max_articles_per_feed = 100
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets= True
     remove_empty_feeds = True
     #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
     cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
     keep_only_tags= [dict(id='articleModule')]
-    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']})]
+    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside')]
     feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
              (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')
              ]

@@ -46,4 +46,4 @@ class Gram_pl(BasicNewsRecipe):
         tag=soup.find(name='span', attrs={'class':'platforma'})
         if tag:
             tag.name = 'p'
         return soup
@@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
 harpers.org - paid subscription/ printed issue articles
 This recipe only get's article's published in text format
@@ -14,7 +14,7 @@ from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class Harpers_full(BasicNewsRecipe):
-    title = "Harper's Magazine - Printed Edition"
+    title = "Harper's Magazine - articles from printed edition"
     __author__ = 'Darko Miletic'
     description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."
     publisher = "Harpers's"
@@ -29,7 +29,6 @@ class Harpers_full(BasicNewsRecipe):
     needs_subscription = 'optional'
     masthead_url = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif'
     publication_type = 'magazine'
-    INDEX = ''
     LOGIN = 'http://harpers.org/wp-content/themes/harpers/ajax_login.php'
     extra_css = """
                 body{font-family: adobe-caslon-pro,serif}
@@ -66,43 +65,42 @@ class Harpers_full(BasicNewsRecipe):

     def parse_index(self):
         #find current issue
         soup = self.index_to_soup('http://harpers.org/')
         currentIssue=soup.find('div',attrs={'class':'mainNavi'}).find('li',attrs={'class':'curentIssue'})
         currentIssue_url=self.tag_to_string(currentIssue.a['href'])
+        self.log(currentIssue_url)
+
         #go to the current issue
         soup1 = self.index_to_soup(currentIssue_url)
-        date = re.split('\s\|\s',self.tag_to_string(soup1.head.title.string))[0]
+        currentIssue_title = self.tag_to_string(soup1.head.title.string)
+        date = re.split('\s\|\s',currentIssue_title)[0]
         self.timefmt = u' [%s]'%date

         #get cover
         self.cover_url = soup1.find('div', attrs = {'class':'picture_hp'}).find('img', src=True)['src']
+        self.log(self.cover_url)
+
         articles = []
         count = 0
         for item in soup1.findAll('div', attrs={'class':'articleData'}):
             text_links = item.findAll('h2')
-            for text_link in text_links:
-                if count == 0:
-                    count = 1
-                else:
-                    url = text_link.a['href']
-                    title = text_link.a.contents[0]
-                    date = strftime(' %B %Y')
-                    articles.append({
-                        'title' :title
-                        ,'date' :date
-                        ,'url' :url
-                        ,'description':''
-                        })
-        return [(soup1.head.title.string, articles)]
+            if text_links:
+                for text_link in text_links:
+                    if count == 0:
+                        count = 1
+                    else:
+                        url = text_link.a['href']
+                        title = self.tag_to_string(text_link.a)
+                        date = strftime(' %B %Y')
+                        articles.append({
+                            'title' :title
+                            ,'date' :date
+                            ,'url' :url
+                            ,'description':''
+                            })
+        return [(currentIssue_title, articles)]

     def print_version(self, url):
         return url + '?single=1'
-
-    def cleanup(self):
-        soup = self.index_to_soup('http://harpers.org/')
-        signouturl=self.tag_to_string(soup.find('li', attrs={'class':'subLogOut'}).findNext('li').a['href'])
-        self.log(signouturl)
-        self.browser.open(signouturl)
@@ -1,6 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-import re
-from datetime import date, timedelta

 class HBR(BasicNewsRecipe):

@@ -11,23 +9,18 @@ class HBR(BasicNewsRecipe):
     timefmt = ' [%B %Y]'
     language = 'en'
     no_stylesheets = True
-    # recipe_disabled = ('hbr.org has started requiring the use of javascript'
-    # ' to log into their website. This is unsupported in calibre, so'
-    # ' this recipe has been disabled. If you would like to see '
-    # ' HBR supported in calibre, contact hbr.org and ask them'
-    # ' to provide a javascript free login method.')

     LOGIN_URL = 'https://hbr.org/login?request_url=/'
     LOGOUT_URL = 'https://hbr.org/logout?request_url=/'

-    INDEX = 'http://hbr.org/archive-toc/BR'
+    INDEX = 'http://hbr.org'

     keep_only_tags = [dict(name='div', id='pageContainer')]
     remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
         'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
         'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
         'mailingListTout', 'partnerCenter', 'pageFooter',
-        'superNavHeadContainer', 'hbrDisqus',
+        'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
         'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
         dict(name='iframe')]
     extra_css = '''
@@ -57,22 +50,6 @@ class HBR(BasicNewsRecipe):
         if url.endswith('/ar/1'):
             return url[:-1]+'pr'

-    def hbr_get_toc(self):
-        # return self.index_to_soup(open('/t/toc.html').read())
-
-        today = date.today()
-        future = today + timedelta(days=30)
-        past = today - timedelta(days=30)
-        for x in [x.strftime('%y%m') for x in (future, today, past)]:
-            url = self.INDEX + x
-            soup = self.index_to_soup(url)
-            if (not soup.find(text='Issue Not Found') and not soup.find(
-                text="We're Sorry. There was an error processing your request")
-                    and 'Exception: java.io.FileNotFoundException' not in
-                    unicode(soup)):
-                return soup
-        raise Exception('Could not find current issue')
-
     def hbr_parse_toc(self, soup):
         feeds = []
         current_section = None
@@ -105,23 +82,19 @@ class HBR(BasicNewsRecipe):

             articles.append({'title':title, 'url':url, 'description':desc,
                 'date':''})

+        if current_section is not None and articles:
+            feeds.append((current_section, articles))
         return feeds

     def parse_index(self):
-        soup = self.hbr_get_toc()
-        # open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8'))
+        soup0 = self.index_to_soup('http://hbr.org/magazine')
+        datencover = soup0.find('ul', attrs={'id':'magazineArchiveCarousel'}).findAll('li')[-1]
+        #find date & cover
+        self.cover_url=datencover.img['src']
+        dates=self.tag_to_string(datencover.img['alt'])
+        self.timefmt = u' [%s]'%dates
+        soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs = {'class':'magazine_page'}).a['href'])
         feeds = self.hbr_parse_toc(soup)
         return feeds
-
-    def get_cover_url(self):
-        cover_url = None
-        index = 'http://hbr.org/current'
-        soup = self.index_to_soup(index)
-        link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
-
-        if link_item:
-            cover_url = 'http://hbr.org' + link_item['src']
-
-        return cover_url
@@ -1,27 +1,22 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class Historia_org_pl(BasicNewsRecipe):
     title = u'Historia.org.pl'
     __author__ = 'fenuks'
     description = u'Artykuły dotyczące historii w układzie epok i tematów, forum. Najlepsza strona historii. Matura z historii i egzamin gimnazjalny z historii.'
     cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg'
     category = 'history'
     language = 'pl'
     oldest_article = 8
+    extra_css = 'img {float: left; margin-right: 10px;} .alignleft {float: left; margin-right: 10px;}'
     remove_empty_feeds= True
     no_stylesheets = True
     use_embedded_content = True
     max_articles_per_feed = 100
     ignore_duplicate_articles = {'title', 'url'}

-    feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'),
-             (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'),
-             (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'),
-             (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'),
-             (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'),
-             (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),]
-
-    def print_version(self, url):
-        return url + '?tmpl=component&print=1&layout=default&page='
+    feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'),
+             (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'),
+             (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'),
+             (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'),
+             (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'),
+             (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),]
BIN  recipes/icons/dzial_zagraniczny.png (new file; 491 B)
BIN  recipes/icons/equipped.png (new file; 929 B)
BIN  recipes/icons/forbes_pl.png (new file; 1.2 KiB)
BIN  recipes/icons/gazeta-prawna-calibre-v1.png (new file; 612 B)
BIN  (modified icon, name not shown; 802 B before, 294 B after)
BIN  (modified icon, name not shown; 802 B before, 294 B after)
BIN  (modified icon, name not shown; 802 B before, 294 B after)
BIN  (modified icon, name not shown; 802 B before, 294 B after)
BIN  recipes/icons/ittechblog.png (new file; 731 B)
BIN  recipes/icons/magazyn_consido.png (new file; 982 B)
BIN  recipes/icons/media2.png (new file; 660 B)
BIN  recipes/icons/mobilna.png (new file; 885 B)
BIN  recipes/icons/mojegotowanie.png (new file; 307 B)
BIN  recipes/icons/najwyzszy_czas.png (new file; 616 B)
BIN  recipes/icons/newsweek_polska.png (new file; 905 B)
BIN  recipes/icons/nowiny_rybnik.png (new file; 1.2 KiB)
BIN  recipes/icons/osw.png (new file; 489 B)
BIN  recipes/icons/ppe_pl.png (new file; 3.1 KiB)
BIN  recipes/icons/presseurop.png (new file; 207 B)
BIN  recipes/icons/res_publica.png (new file; 733 B)
BIN  recipes/icons/slashdot.png (new file; 250 B)
BIN  recipes/icons/sport_pl.png (new file; 627 B)
BIN  recipes/icons/sportowefakty.png (new file; 511 B)
BIN  recipes/icons/wolne_media.png (new file; 497 B)
BIN  recipes/icons/wysokie_obcasy.png (new file; 205 B)
@@ -1,21 +1,20 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class INFRA(BasicNewsRecipe):
     title = u'INFRA'
     oldest_article = 7
     max_articles_per_feed = 100
     __author__ = 'fenuks'
     description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
-    cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
+    cover_url = 'http://i.imgur.com/j7hJT.jpg'
     category = 'UFO'
     index='http://infra.org.pl'
     language = 'pl'
     max_articles_per_feed = 100
-    no_stylesheers=True
-    remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
-    remove_tags_after=dict(attrs={'class':'pagenav'})
-    remove_tags=[dict(attrs={'class':'pagenav'})]
-    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/rss')]
+    remove_attrs = ['style']
+    no_stylesheets = True
+    keep_only_tags = [dict(id='ja-current-content')]
+    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/rss')]

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):

@@ -23,4 +22,4 @@ class INFRA(BasicNewsRecipe):
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                 a['href']=self.index + a['href']
         return soup
recipes/ittechblog.recipe (new file, 26 lines)
@@ -0,0 +1,26 @@
+__license__ = 'GPL v3'
+__copyright__ = 'MrStefan'
+
+'''
+www.ittechblog.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ittechblog(BasicNewsRecipe):
+    title = u'IT techblog'
+    __author__ = 'MrStefan <mrstefaan@gmail.com>'
+    language = 'pl'
+    description =u'Na naszym blogu technologicznym znajdziesz między innymi: testy sprzętu, najnowsze startupy, technologiczne nowinki, felietony tematyczne.'
+    extra_css = '.cover > img {display:block;}'
+    remove_empty_feeds = True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+
+    keep_only_tags =[dict(attrs={'class':'box'})]
+    remove_tags =[dict(name='aside'), dict(attrs={'class':['tags', 'counter', 'twitter-share-button']})]
+
+    feeds = [(u'Artykuły', u'http://feeds.feedburner.com/ITTechBlog?format=xml')]
@@ -1,14 +1,16 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class KDEFamilyPl(BasicNewsRecipe):
     title = u'KDEFamily.pl'
     __author__ = 'fenuks'
     description = u'KDE w Polsce'
     category = 'open source, KDE'
     language = 'pl'
     cover_url = 'http://www.mykde.home.pl/kdefamily/wp-content/uploads/2012/07/logotype-e1341585198616.jpg'
     oldest_article = 7
     max_articles_per_feed = 100
+    preprocess_regexps = [(re.compile(r"Podobne wpisy.*", re.IGNORECASE|re.DOTALL), lambda m: '')]
     no_stylesheets = True
     use_embedded_content = True
     feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')]
@@ -3,10 +3,10 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup

 class Konflikty(BasicNewsRecipe):
     title = u'Konflikty Zbrojne'
     __author__ = 'fenuks'
     cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg'
     language = 'pl'
     description = u'Zbiór ciekawych artykułów historycznych, militarnych oraz recenzji książek, gier i filmów. Najświeższe informacje o lotnictwie, wojskach lądowych i polityce.'
     category='military, history'
     oldest_article = 7

@@ -14,19 +14,20 @@ class Konflikty(BasicNewsRecipe):
     no_stylesheets = True
     keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]

     feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
              (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
              (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
              (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
              (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
              (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
         for image in soup.findAll(name='a', attrs={'class':'image'}):
+            image['style'] = 'width: 210px; float: left; margin-right:5px;'
             if image.img and image.img.has_key('alt'):
                 image.name='div'
                 pos = len(image.contents)
                 image.insert(pos, BeautifulSoup('<p style="font-style:italic;">'+image.img['alt']+'</p>'))
         return soup
@@ -2,12 +2,13 @@

from calibre.web.feeds.news import BasicNewsRecipe

class Kosmonauta(BasicNewsRecipe):
    title = u'Kosmonauta.net'
    __author__ = 'fenuks'
    description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
    category = 'astronomy'
    language = 'pl'
    cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
+   extra_css = '.thumbnail {float:left;margin-right:5px;}'
    no_stylesheets = True
    INDEX = 'http://www.kosmonauta.net'
    oldest_article = 7
@@ -16,9 +17,12 @@ class Kosmonauta(BasicNewsRecipe):
    remove_attributes = ['style']
    max_articles_per_feed = 100
    keep_only_tags = [dict(name='div', attrs={'class':'item-page'})]
-   remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})]
+   remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']}), dict(attrs={'alt':['Poprzednia strona', 'Następna strona']})]
    remove_tags_after = dict(name='div', attrs={'class':'cedtag'})
    feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')]

+   def print_version(self, url):
+       return url + '?tmpl=component&print=1&layout=default&page='
+
    def preprocess_html(self, soup):
        for a in soup.findAll(name='a'):
@@ -26,5 +30,4 @@ class Kosmonauta(BasicNewsRecipe):
            href = a['href']
            if not href.startswith('http'):
                a['href'] = self.INDEX + href
        return soup
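
Both additions follow a common Joomla pattern; a minimal sketch of how they fit together in a recipe (the class name here is hypothetical, the hooks are standard BasicNewsRecipe ones):

    from calibre.web.feeds.news import BasicNewsRecipe

    class JoomlaPrintSketch(BasicNewsRecipe):  # hypothetical name
        INDEX = 'http://www.kosmonauta.net'

        def print_version(self, url):
            # Joomla exposes a printer-friendly view via these query parameters.
            return url + '?tmpl=component&print=1&layout=default&page='

        def preprocess_html(self, soup):
            # Absolutize relative links so they still work after conversion.
            for a in soup.findAll(name='a'):
                if a.has_key('href') and not a['href'].startswith('http'):
                    a['href'] = self.INDEX + a['href']
            return soup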

@@ -2,8 +2,7 @@
from calibre.web.feeds.news import BasicNewsRecipe

class KrytykaPolitycznaRecipe(BasicNewsRecipe):
-   __license__ = 'GPL v3'
-   __author__ = u'intromatyk <intromatyk@gmail.com>'
+   __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

@@ -12,7 +12,6 @@ http://www.ledevoir.com/
import re

from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.utils.magick import Image

class ledevoir(BasicNewsRecipe):
    author = 'Lorenzo Vigentini'
@@ -129,12 +128,12 @@ class ledevoir(BasicNewsRecipe):
        img = Image()
        img.open(iurl)
        # width, height = img.size
        # print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
        if img < 0:
            raise RuntimeError('Out of memory')
        img.set_compression_quality(30)
        img.save(iurl)
        return soup
    '''

@@ -1,33 +1,23 @@
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe

class AListApart (BasicNewsRecipe):
-   __author__ = u'Marc Busqué <marc@lamarciana.com>'
+   __author__ = 'Marc Busqué <marc@lamarciana.com>'
    __url__ = 'http://www.lamarciana.com'
-   __version__ = '1.0'
+   __version__ = '2.0'
    __license__ = 'GPL v3'
-   __copyright__ = u'2012, Marc Busqué <marc@lamarciana.com>'
+   __copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
    title = u'A List Apart'
-   description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.'
+   description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieve articles and columns.'
    language = 'en'
    tags = 'web development, software'
    oldest_article = 120
    remove_empty_feeds = True
-   no_stylesheets = True
    encoding = 'utf8'
    cover_url = u'http://alistapart.com/pix/alalogo.gif'
-   keep_only_tags = [
-       dict(name='div', attrs={'id': 'content'})
-   ]
-   remove_tags = [
-       dict(name='ul', attrs={'id': 'metastuff'}),
-       dict(name='div', attrs={'class': 'discuss'}),
-       dict(name='div', attrs={'class': 'discuss'}),
-       dict(name='div', attrs={'id': 'learnmore'}),
-   ]
-   remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
-   extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}'
+   extra_css = u'img {max-width: 100%; display: block; margin: auto;}'

    feeds = [
-       (u'A List Apart', u'http://www.alistapart.com/site/rss'),
+       (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
    ]
recipes/magazyn_consido.recipe (new file, 88 lines)
@@ -0,0 +1,88 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
magazynconsido.pl/
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image

class magazynconsido(BasicNewsRecipe):
    title = u'Magazyn Consido'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com> ,teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Portal dla architektów i projektantów'
    masthead_url = 'http://qualitypixels.pl/wp-content/themes/airlock/advance/inc/timthumb.php?src=http://qualitypixels.pl/wp-content/uploads/2012/01/logotyp-magazynconsido-11.png&w=455&zc=1'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False

    keep_only_tags = []
    keep_only_tags.append(dict(name='h1'))
    keep_only_tags.append(dict(name='p'))
    keep_only_tags.append(dict(attrs={'class': 'navigation'}))
    remove_tags = [dict(attrs={'style': 'font-size: x-small;'})]

    remove_tags_after = [dict(attrs={'class': 'navigation'})]

    extra_css = ''' img {max-width:30%; max-height:30%; display: block; margin-left: auto; margin-right: auto;}
    h1 {text-align: center;}'''

    def parse_index(self):  # (kk)
        soup = self.index_to_soup('http://feeds.feedburner.com/magazynconsido?format=xml')
        feeds = []
        articles = {}
        sections = []
        section = ''

        for item in soup.findAll('item'):
            section = self.tag_to_string(item.category)
            if not articles.has_key(section):
                sections.append(section)
                articles[section] = []
            article_url = self.tag_to_string(item.guid)
            article_title = self.tag_to_string(item.title)
            article_date = self.tag_to_string(item.pubDate)
            article_description = self.tag_to_string(item.description)
            articles[section].append({'title': article_title, 'url': article_url, 'date': article_date, 'description': article_description})

        for section in sections:
            if section == 'Video':
                feeds.append((section, articles[section]))
                feeds.pop()
            else:
                feeds.append((section, articles[section]))
        return feeds

    def append_page(self, soup, appendtag):
        apage = soup.find('div', attrs={'class': 'wp-pagenavi'})
        if apage is not None:
            nexturl = soup.find('a', attrs={'class': 'nextpostslink'})
            soup2 = self.index_to_soup(nexturl['href'])
            pagetext = soup2.findAll('p')
            for tag in pagetext:
                pos = len(appendtag.contents)
                appendtag.insert(pos, tag)

        while appendtag.find('div', attrs={'class': ['height: 35px;', 'post-meta', 'addthis_toolbox addthis_default_style addthis_', 'post-meta-bottom', 'block_recently_post', 'fbcomments', 'pin-it-button', 'pages', 'navigation']}) is not None:
            appendtag.find('div', attrs={'class': ['height: 35px;', 'post-meta', 'addthis_toolbox addthis_default_style addthis_', 'post-meta-bottom', 'block_recently_post', 'fbcomments', 'pin-it-button', 'pages', 'navigation']}).replaceWith('')

    def preprocess_html(self, soup):  # (kk)
        self.append_page(soup, soup.body)
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first):
        # process all the images
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
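
parse_index above groups RSS items into sections keyed on each item's <category>; note that the 'Video' branch appends and immediately pops, so Video entries are effectively skipped. A condensed sketch of the same grouping, with illustrative names (tag_to_string and index_to_soup are real recipe helpers):

    # Sketch: build calibre feed sections from RSS <item> categories,
    # skipping an unwanted section, as the recipe above does.
    def parse_index(self):
        soup = self.index_to_soup('http://feeds.feedburner.com/magazynconsido?format=xml')
        articles = {}
        order = []
        for item in soup.findAll('item'):
            section = self.tag_to_string(item.category)
            if section not in articles:
                order.append(section)
                articles[section] = []
            articles[section].append({
                'title': self.tag_to_string(item.title),
                'url': self.tag_to_string(item.guid),
                'date': self.tag_to_string(item.pubDate),
                'description': self.tag_to_string(item.description)})
        return [(s, articles[s]) for s in order if s != 'Video']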
recipes/media2.recipe (new file, 35 lines)
@@ -0,0 +1,35 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = 'teepel'

'''
media2.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe

class media2_pl(BasicNewsRecipe):
    title = u'Media2'
    __author__ = 'teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Media2.pl to jeden z najczęściej odwiedzanych serwisów dla profesjonalistów z branży medialnej, telekomunikacyjnej, public relations oraz nowych technologii.'
    masthead_url = 'http://media2.pl/res/logo/www.png'
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    simultaneous_downloads = 5

    extra_css = '''.news-lead{font-weight: bold; }'''

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'class': 'news-item tpl-big'}))

    remove_tags = []
    remove_tags.append(dict(name='span', attrs={'class': 'news-comments'}))
    remove_tags.append(dict(name='div', attrs={'class': 'item-sidebar'}))
    remove_tags.append(dict(name='div', attrs={'class': 'news-tags'}))

    feeds = [(u'Media2', u'http://feeds.feedburner.com/media2')]
@@ -6,10 +6,10 @@ import time

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    title = u'Metro UK'
-   description = 'News as provided by The Metro -UK'
+   description = 'News from The Metro, UK'
    #timefmt = ''
-   __author__ = 'fleclerc & Dave Asbury'
+   __author__ = 'Dave Asbury'
-   #last update 20/1/13
+   #last update 4/4/13
    #cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'

    cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
@@ -22,7 +22,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):

    language = 'en_GB'
    masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
+   compress_news_images = True
    def parse_index(self):
        articles = {}
        key = None
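
compress_news_images, added here, is one of a family of image-handling attributes that BasicNewsRecipe grew around this release; the New Yorker recipe further down sets the related ones. A hedged sketch of the knobs together (hypothetical recipe, values taken from the diffs in this merge):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ImageOptionsSketch(BasicNewsRecipe):  # hypothetical name
        compress_news_images = True            # re-encode images to save space
        compress_news_images_auto_size = 8     # automatic quality/size trade-off
        scale_news_images_to_device = False    # do not rescale to the device screen
        scale_news_images = (768, 1024)        # cap at this width x height instead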
@@ -2,13 +2,14 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class Mlody_technik(BasicNewsRecipe):
    title = u'Młody technik'
    __author__ = 'fenuks'
    description = u'Młody technik'
    category = 'science'
    language = 'pl'
    #cover_url = 'http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
    no_stylesheets = True
+   extra_css = 'img.alignleft {float: left; margin-right: 5px;}'
    preprocess_regexps = [(re.compile(r"<h4>Podobne</h4>", re.IGNORECASE), lambda m: '')]
    oldest_article = 7
    max_articles_per_feed = 100
@@ -17,18 +18,18 @@ class Mlody_technik(BasicNewsRecipe):
    keep_only_tags = [dict(id='content')]
    remove_tags = [dict(attrs={'class':'st-related-posts'})]
    remove_tags_after = dict(attrs={'class':'entry-content clearfix'})
    feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'),
             #(u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'),
             (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'),
             (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'),
             (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'),
             (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'),
             (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'),
             (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')]

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.mt.com.pl/')
        tag = soup.find(attrs={'class':'xoxo'})
        if tag:
            self.cover_url = tag.find('img')['src']
        return getattr(self, 'cover_url', self.cover_url)
recipes/mobilna.recipe (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = 'MrStefan'

'''
www.mobilna.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe

class mobilna(BasicNewsRecipe):
    title = u'Mobilna.pl'
    __author__ = 'MrStefan <mrstefaan@gmail.com>'
    language = 'pl'
    description = u'twoja mobilna strona'
    #masthead_url = ''
    remove_empty_feeds = True
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = True
    #keep_only_tags = [dict(attrs={'class':'Post'})]

    feeds = [(u'Artykuły', u'http://mobilna.pl/feed/')]
recipes/mojegotowanie.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = 'MrStefan, teepel'

'''
www.mojegotowanie.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe

class mojegotowanie(BasicNewsRecipe):
    title = u'Moje Gotowanie'
    __author__ = 'MrStefan <mrstefaan@gmail.com>, teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Gotowanie to Twoja pasja? Uwielbiasz sałatki? Lubisz grillować? Przepisy kulinarne doskonałe na wszystkie okazje znajdziesz na www.mojegotowanie.pl.'
    masthead_url = 'http://www.mojegotowanie.pl/extension/selfstart/design/self/images/top_c2.gif'
    cover_url = 'http://www.mojegotowanie.pl/extension/selfstart/design/self/images/mgpl/mojegotowanie.gif'
    remove_empty_feeds = True
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'class': 'content'}))

    feeds = [(u'Artykuły', u'http://mojegotowanie.pl/rss/feed/artykuly'),
             (u'Przepisy', u'http://mojegotowanie.pl/rss/feed/przepisy')]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles[:]:
                if 'film' in article.title:
                    feed.articles.remove(article)
        return feeds

    def get_article_url(self, article):
        link = article.get('link')
        if 'Clayout0Cset0Cprint0' in link:
            return link

    def print_version(self, url):
        segment = url.split('/')
        URLPart = segment[-2]
        URLPart = URLPart.replace('0L0Smojegotowanie0Bpl0Clayout0Cset0Cprint0C', '/')
        URLPart = URLPart.replace('0I', '_')
        URLPart = URLPart.replace('0C', '/')
        return 'http://www.mojegotowanie.pl/layout/set/print' + URLPart
recipes/najwyzszy_czas.recipe (new file, 27 lines)
@@ -0,0 +1,27 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = 'teepel <teepel44@gmail.com>'

'''
nczas.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class nczas(BasicNewsRecipe):
    title = u'Najwy\u017cszy Czas'
    __author__ = 'teepel <teepel44@gmail.com>'
    language = 'pl'
    description = 'Wiadomości z nczas.com'
    INDEX = 'http://nczas.com'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = True
    remove_empty_feeds = True
    simultaneous_downloads = 5
    remove_javascript = True
    remove_attributes = ['style']
    no_stylesheets = True

    feeds = [(u'Najwyższy Czas', u'http://nczas.com/feed/')]
@@ -1,16 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class NaukawPolsce(BasicNewsRecipe):
    title = u'Nauka w Polsce'
    __author__ = 'fenuks'
    description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.'
    category = 'science'
    language = 'pl'
    cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
+   extra_css = '.miniaturka {float: left; margin-right: 5px; max-width: 350px;} .miniaturka-dol-strony {display: inline-block; margin: 0 15px; width: 120px;}'
+   ignore_duplicate_articles = {'title', 'url'}
    index = 'http://www.naukawpolsce.pl'
    keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})]
    remove_tags = [dict(name='div', attrs={'class':'tagi'})]
@@ -23,8 +25,8 @@ class NaukawPolsce(BasicNewsRecipe):
            url = self.index + i.h1.a['href']
            date = ''  # i.span.string
            articles.append({'title': title,
                             'url': url,
                             'date': date,
                             'description': ''
                             })
        return articles
@@ -44,4 +46,4 @@ class NaukawPolsce(BasicNewsRecipe):
    def preprocess_html(self, soup):
        for p in soup.findAll(name='p', text=re.compile('&nbsp;')):
            p.extract()
        return soup
@@ -1,64 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
-__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
-'''
-newyorker.com
-'''

+'''
+www.canada.com
+'''
+import re
from calibre.web.feeds.news import BasicNewsRecipe

+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup

class NewYorker(BasicNewsRecipe):
-   title = 'The New Yorker'
-   __author__ = 'Darko Miletic'
-   description = 'The best of US journalism'
-   oldest_article = 15
-   language = 'en'
-   max_articles_per_feed = 100
-   no_stylesheets = True
-   use_embedded_content = False
-   publisher = 'Conde Nast Publications'
-   category = 'news, politics, USA'
-   encoding = 'cp1252'
-   publication_type = 'magazine'
-   masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
-   extra_css = """
-       body {font-family: "Times New Roman",Times,serif}
-       .articleauthor{color: #9F9F9F;
-           font-family: Arial, sans-serif;
-           font-size: small;
-           text-transform: uppercase}
-       .rubric,.dd,h6#credit{color: #CD0021;
-           font-family: Arial, sans-serif;
-           font-size: small;
-           text-transform: uppercase}
-       .descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
-       .dd,h6#credit{color: gray}
-       .c{display: block}
-       .caption,h2#articleintro{font-style: italic}
-       .caption{font-size: small}
-   """
-
-   conversion_options = {
-       'comment'    : description
-       , 'tags'     : category
-       , 'publisher': publisher
-       , 'language' : language
-   }
-
-   keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})]
-   remove_tags = [
-       dict(name=['meta','iframe','base','link','embed','object'])
-       ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] })
-       ,dict(attrs={'id':['show-header','show-footer'] })
-   ]
-   remove_tags_after = dict(attrs={'class':'entry-content'})
-   remove_attributes = ['lang']
-   feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')]
-
-   def print_version(self, url):
-       return url + '?printable=true&currentPage=all'
-
-   def image_url_processor(self, baseurl, url):
-       return url.strip()
+   title = u'New Yorker Magazine'
+   newyorker_prefix = 'http://m.newyorker.com'
+   description = u'Content from the New Yorker website'
+   fp_tag = 'CAN_TC'
+
+   masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
+
+   compress_news_images = True
+   compress_news_images_auto_size = 8
+   scale_news_images_to_device = False
+   scale_news_images = (768, 1024)
+
+   url_list = []
+   language = 'en'
+   __author__ = 'Nick Redding'
+   no_stylesheets = True
+   timefmt = ' [%b %d]'
+   encoding = 'utf-8'
+   extra_css = '''
+       .byline { font-size:xx-small; font-weight: bold;}
+       h3 { margin-bottom: 6px; }
+       .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
+   '''
+   keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
+
+   remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]

    def get_cover_url(self):
        cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
@@ -68,13 +48,233 @@ class NewYorker(BasicNewsRecipe):
        cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
        return cover_url

-   def preprocess_html(self, soup):
-       for item in soup.findAll(style=True):
-           del item['style']
-       auth = soup.find(attrs={'id':'articleauthor'})
-       if auth:
-           alink = auth.find('a')
-           if alink and alink.string is not None:
-               txt = alink.string
-               alink.replaceWith(txt)
+   def fixChars(self,string):
+       # Replace lsquo (\x91)
+       fixed = re.sub("\x91","‘",string)
+       # Replace rsquo (\x92)
+       fixed = re.sub("\x92","’",fixed)
+       # Replace ldquo (\x93)
+       fixed = re.sub("\x93","“",fixed)
+       # Replace rdquo (\x94)
+       fixed = re.sub("\x94","”",fixed)
+       # Replace ndash (\x96)
+       fixed = re.sub("\x96","–",fixed)
+       # Replace mdash (\x97)
+       fixed = re.sub("\x97","—",fixed)
+       fixed = re.sub("&#x2019;","’",fixed)
+       return fixed
+
+   def massageNCXText(self, description):
+       # Kindle TOC descriptions won't render certain characters
+       if description:
+           massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+           # Replace '&amp;' with '&'
+           massaged = re.sub("&amp;","&", massaged)
+           return self.fixChars(massaged)
+       else:
+           return description
+
+   def populate_article_metadata(self, article, soup, first):
+       if first:
+           picdiv = soup.find('body').find('img')
+           if picdiv is not None:
+               self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+       xtitle = article.text_summary.strip()
+       if len(xtitle) == 0:
+           desc = soup.find('meta',attrs={'property':'og:description'})
+           if desc is not None:
+               article.summary = article.text_summary = desc['content']
+       shortparagraph = ""
+       ## try:
+       if len(article.text_summary.strip()) == 0:
+           articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
+           if articlebodies:
+               for articlebody in articlebodies:
+                   if articlebody:
+                       paras = articlebody.findAll('p')
+                       for p in paras:
+                           refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
+                           # account for blank paragraphs and short paragraphs by appending them to longer ones
+                           if len(refparagraph) > 0:
+                               if len(refparagraph) > 70:  # approximately one line of text
+                                   newpara = shortparagraph + refparagraph
+                                   article.summary = article.text_summary = newpara.strip()
+                                   return
+                               else:
+                                   shortparagraph = refparagraph + " "
+                                   if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
+                                       shortparagraph = shortparagraph + "- "
+       else:
+           article.summary = article.text_summary = self.massageNCXText(article.text_summary)
+       ## except:
+       ##     self.log("Error creating article descriptions")
+       ##     return
+
+   def strip_anchors(self,soup):
+       paras = soup.findAll(True)
+       for para in paras:
+           aTags = para.findAll('a')
+           for a in aTags:
+               if a.img is None:
+                   a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
+
+   def preprocess_html(self,soup):
+       dateline = soup.find('div','published')
+       byline = soup.find('div','byline')
+       title = soup.find('h1','entry-title')
+       if title is None:
+           return self.strip_anchors(soup)
+       if byline is None:
+           title.append(dateline)
+           return self.strip_anchors(soup)
+       byline.append(dateline)
+       return self.strip_anchors(soup)
+
+   def load_global_nav(self,soup):
+       seclist = []
+       ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
+       if ul is not None:
+           for li in ul.findAll('li'):
+               if li.a is not None:
+                   securl = li.a['href']
+                   if securl != '/' and securl != '/magazine' and securl.startswith('/'):
+                       seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
+       return seclist
+
+   def exclude_url(self,url):
+       if url in self.url_list:
+           return True
+       if not url.endswith('html'):
+           return True
+       if 'goings-on-about-town-app' in url:
+           return True
+       if 'something-to-be-thankful-for' in url:
+           return True
+       if '/shouts/' in url:
+           return True
+       if 'out-loud' in url:
+           return True
+       if '/rss/' in url:
+           return True
+       if '/video-' in url:
+           return True
+       self.url_list.append(url)
+       return False
+
+   def load_index_page(self,soup):
+       article_list = []
+       for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
+           h2 = div.h2
+           if h2 is not None:
+               a = h2.a
+               if a is not None:
+                   url = a['href']
+                   if not self.exclude_url(url):
+                       if url.startswith('/'):
+                           url = self.newyorker_prefix+url
+                       byline = h2.span
+                       if byline is not None:
+                           author = self.tag_to_string(byline)
+                           if author.startswith('by '):
+                               author.replace('by ','')
+                           byline.extract()
+                       else:
+                           author = ''
+                       if h2.br is not None:
+                           h2.br.replaceWith(' ')
+                       title = self.tag_to_string(h2)
+                       desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
+                       if desc is not None:
+                           description = self.tag_to_string(desc)
+                       else:
+                           description = ''
+                       article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
+           ul = div.find('ul','feature-blurb-links')
+           if ul is not None:
+               for li in ul.findAll('li'):
+                   a = li.a
+                   if a is not None:
+                       url = a['href']
+                       if not self.exclude_url(url):
+                           if url.startswith('/'):
+                               url = self.newyorker_prefix+url
+                           if a.br is not None:
+                               a.br.replaceWith(' ')
+                           title = '>>'+self.tag_to_string(a)
+                           article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
+       for h3 in soup.findAll('h3','header'):
+           a = h3.a
+           if a is not None:
+               url = a['href']
+               if not self.exclude_url(url):
+                   if url.startswith('/'):
+                       url = self.newyorker_prefix+url
+                   byline = h3.span
+                   if byline is not None:
+                       author = self.tag_to_string(byline)
+                       if author.startswith('by '):
+                           author = author.replace('by ','')
+                       byline.extract()
+                   else:
+                       author = ''
+                   if h3.br is not None:
+                       h3.br.replaceWith(' ')
+                   title = self.tag_to_string(h3).strip()
+                   article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
+       return article_list
+
+   def load_global_section(self,securl):
+       article_list = []
+       try:
+           soup = self.index_to_soup(securl)
+       except:
+           return article_list
+       if '/blogs/' not in securl:
+           return self.load_index_page(soup)
+       for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
+           h3 = div.h3
+           if h3 is not None:
+               a = h3.a
+               if a is not None:
+                   url = a['href']
+                   if not self.exclude_url(url):
+                       if url.startswith('/'):
+                           url = self.newyorker_prefix+url
+                       if h3.br is not None:
+                           h3.br.replaceWith(' ')
+                       title = self.tag_to_string(h3)
+                       article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
+       return article_list
+
+   def filter_ans(self, ans):
+       total_article_count = 0
+       idx = 0
+       idx_max = len(ans)-1
+       while idx <= idx_max:
+           if True:  # self.verbose
+               self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+           for article in ans[idx][1]:
+               total_article_count += 1
+               if True:  # self.verbose
+                   self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
+                             article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
+           idx = idx+1
+       self.log( "Queued %d articles" % total_article_count )
+       return ans
+
+   def parse_index(self):
+       ans = []
+       try:
+           soup = self.index_to_soup(self.newyorker_prefix)
+       except:
+           return ans
+       seclist = self.load_global_nav(soup)
+       ans.append(('Front Page',self.load_index_page(soup)))
+       for (sectitle,securl) in seclist:
+           ans.append((sectitle,self.load_global_section(securl)))
+       return self.filter_ans(ans)
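
fixChars above targets cp1252 'smart punctuation' bytes (\x91 to \x97) that leak into scraped pages, plus one HTML-entity form of the right single quote. The same idea as a standalone sketch:

    # Sketch of the cp1252 punctuation repair used by fixChars above.
    import re

    CP1252_PUNCT = {
        '\x91': u'‘', '\x92': u'’', '\x93': u'“',
        '\x94': u'”', '\x96': u'–', '\x97': u'—',
    }

    def fix_chars(text):
        # Map stray cp1252 control-range bytes to their Unicode equivalents.
        for bad, good in CP1252_PUNCT.items():
            text = re.sub(bad, good, text)
        return text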
@@ -1,17 +1,19 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Niebezpiecznik_pl(BasicNewsRecipe):
    title = u'Niebezpiecznik.pl'
    __author__ = 'fenuks'
    description = u'Niebezpiecznik.pl – o bezpieczeństwie i nie...'
    category = 'hacking, IT'
    language = 'pl'
    oldest_article = 8
+   extra_css = '.entry {margin-top: 25px;}'
+   remove_attrs = ['style']
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png'
    remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})]
    keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})]
    feeds = [(u'Wiadomości', u'http://feeds.feedburner.com/niebezpiecznik/'),
             ('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')]
@@ -12,6 +12,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    max_articles_per_feed = 20
    #auto_cleanup = True
    language = 'en_GB'
+   compress_news_images = True

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
@@ -27,7 +28,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
            br.open_novisit(cov2)
            cover_url = str(cov2)
        except:
            cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
        return cover_url

    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
recipes/nowiny_rybnik.recipe (new file, 31 lines)
@@ -0,0 +1,31 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe

class NowinyRybnik(BasicNewsRecipe):
    title = u'Nowiny - Rybnik'
    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    description = u'Tygodnik Regionalny NOWINY. Ogłoszenia drobne, wiadomości i wydarzenia z regionu Rybnika i okolic'
    oldest_article = 7
    masthead_url = 'http://www.nowiny.rybnik.pl/logo/logo.jpg'
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = [(dict(name='div', attrs={'id': 'drukuj'}))]

    remove_tags = []
    remove_tags.append(dict(name='div', attrs={'id': 'footer'}))

    feeds = [(u'Wszystkie artykuły', u'http://www.nowiny.rybnik.pl/rss,artykuly,dzial,0,miasto,0,ile,25.xml')]

    def preprocess_html(self, soup):
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
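
The preprocess_html above (repeated verbatim in res_publica.recipe later in this merge) flattens hyperlinks: any anchor wrapping plain text is replaced by its bare string, which reads more cleanly on e-ink devices. As a standalone sketch:

    # Sketch of the link-flattening idiom shared by these recipes.
    def flatten_links(soup):
        for alink in soup.findAll('a'):
            if alink.string is not None:  # text-only anchor, no nested markup
                alink.replaceWith(alink.string)
        return soup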
recipes/osw.recipe (new file, 41 lines)
@@ -0,0 +1,41 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = 'teepel <teepel44@gmail.com>'

'''
http://www.osw.waw.pl - Osrodek studiow wschodnich
'''

from calibre.web.feeds.news import BasicNewsRecipe

class OSW_Recipe(BasicNewsRecipe):

    language = 'pl'
    title = u'Ośrodek Studiów Wschodnich'
    __author__ = 'teepel <teepel44@gmail.com>'
    INDEX = 'http://www.osw.waw.pl'
    description = u'Ośrodek Studiów Wschodnich im. Marka Karpia. Centre for Eastern Studies.'
    category = u'News'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    keep_only_tags = []
    # this line should show the title of the article, but it doesn't work
    keep_only_tags.append(dict(name='h1', attrs={'class': 'print-title'}))
    keep_only_tags.append(dict(name='div', attrs={'class': 'print-submitted'}))
    keep_only_tags.append(dict(name='div', attrs={'class': 'print-content'}))

    remove_tags = []
    remove_tags.append(dict(name='table', attrs={'id': 'attachments'}))
    remove_tags.append(dict(name='div', attrs={'class': 'print-submitted'}))

    feeds = [(u'OSW', u'http://www.osw.waw.pl/pl/rss.xml')]

    def print_version(self, url):
        return url.replace('http://www.osw.waw.pl/pl/', 'http://www.osw.waw.pl/pl/print/')
@@ -1,11 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe

class OSWorld(BasicNewsRecipe):
    title = u'OSWorld.pl'
    __author__ = 'fenuks'
    description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!'
    category = 'OS, IT, open source, Linux'
    language = 'pl'
    cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png'
+   extra_css = 'img.alignleft {float: left; margin-right: 5px;}'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
@@ -14,7 +15,7 @@ class OSWorld(BasicNewsRecipe):
    keep_only_tags = [dict(id=['dzial', 'posts'])]
    remove_tags = [dict(attrs={'class':'post-comments'})]
    remove_tags_after = dict(attrs={'class':'entry clr'})
    feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')]

    def append_page(self, soup, appendtag):
        tag = appendtag.find(attrs={'id':'paginacja'})
@@ -30,4 +31,4 @@ class OSWorld(BasicNewsRecipe):

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
@@ -1,20 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class PC_Centre(BasicNewsRecipe):
    title = u'PC Centre'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pccentre.pl/views/images/logo.gif'
    cover_url = 'http://pccentre.pl/views/images/logo.gif'
    no_stylesheets = True
    remove_empty_feeds = True
+   ignore_duplicate_articles = {'title', 'url'}
    #keep_only_tags = [dict(id='content')]
    #remove_tags = [dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
    remove_tags = [dict(attrs={'class':'logo_print'})]
    feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]

    def print_version(self, url):
        return url.replace('show', 'print')
recipes/ppe_pl.recipe (new file, 41 lines)
@@ -0,0 +1,41 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe

class ppeRecipe(BasicNewsRecipe):
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'

    title = u'ppe.pl'
    category = u'News'
    description = u'Portal o konsolach i grach wideo.'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 1
    max_articles_per_feed = 100000
    recursions = 0
    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 2

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'class': 'news-heading'}))
    keep_only_tags.append(dict(name='div', attrs={'class': 'tresc-poziom'}))

    remove_tags = []
    remove_tags.append(dict(name='div', attrs={'class': 'bateria1'}))
    remove_tags.append(dict(name='div', attrs={'class': 'bateria2'}))
    remove_tags.append(dict(name='div', attrs={'class': 'bateria3'}))
    remove_tags.append(dict(name='div', attrs={'class': 'news-photo'}))
    remove_tags.append(dict(name='div', attrs={'class': 'fbl'}))
    remove_tags.append(dict(name='div', attrs={'class': 'info'}))
    remove_tags.append(dict(name='div', attrs={'class': 'links'}))

    remove_tags.append(dict(name='div', attrs={'style': 'padding: 4px'}))

    feeds = [
        ('Newsy', 'feed://ppe.pl/rss/rss.xml'),
    ]
recipes/presseurop.recipe
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
www.presseurop.eu/pl
|
||||||
|
'''
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__author__ = 'teepel <teepel44@gmail.com>'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
|
|
||||||
|
class presseurop(BasicNewsRecipe):
|
||||||
|
title = u'Presseurop'
|
||||||
|
description = u'Najlepsze artykuły z prasy europejskiej'
|
||||||
|
language = 'pl'
|
||||||
|
oldest_article = 7
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
auto_cleanup = True
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Polityka', u'http://www.presseurop.eu/pl/taxonomy/term/1/%2A/feed'),
|
||||||
|
(u'Społeczeństwo', u'http://www.presseurop.eu/pl/taxonomy/term/2/%2A/feed'),
|
||||||
|
(u'Gospodarka', u'http://www.presseurop.eu/pl/taxonomy/term/3/%2A/feed'),
|
||||||
|
(u'Kultura i debaty', u'http://www.presseurop.eu/pl/taxonomy/term/4/%2A/feed'),
|
||||||
|
(u'UE i Świat', u'http://www.presseurop.eu/pl/taxonomy/term/5/%2A/feed')
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
preprocess_regexps = [
|
||||||
|
(re.compile(r'\|.*</title>', re.DOTALL|re.IGNORECASE),
|
||||||
|
lambda match: '</title>'),
|
||||||
|
]
|
@@ -23,8 +23,8 @@ class PublicoPT(BasicNewsRecipe):
    remove_empty_feeds = True
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '

-   keep_only_tags = [dict(attrs={'class':['content-noticia-title','artigoHeader','ECOSFERA_MANCHETE','noticia','textoPrincipal','ECOSFERA_texto_01']})]
+   keep_only_tags = [dict(attrs={'class':['hentry article single']})]
-   remove_tags = [dict(attrs={'class':['options','subcoluna']})]
+   remove_tags = [dict(attrs={'class':['entry-options entry-options-above group','entry-options entry-options-below group', 'module tag-list']})]

    feeds = [
        (u'Geral', u'http://feeds.feedburner.com/publicoRSS'),
recipes/res_publica.recipe (new file, 35 lines)
@@ -0,0 +1,35 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe

class ResPublicaNowaRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Res Publica Nowa'
    category = u'News'
    description = u'Portal kulturalno-społecznego kwartalnika o profilu liberalnym, wydawany przez Fundację Res Publica'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0
    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 5

    feeds = [
        ('Artykuly', 'feed://publica.pl/feed'),
    ]

    def preprocess_html(self, soup):
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
@@ -7,7 +7,6 @@ sfgate.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
-import re

class SanFranciscoChronicle(BasicNewsRecipe):
    title = u'San Francisco Chronicle'
@@ -19,16 +18,7 @@ class SanFranciscoChronicle(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
+   auto_cleanup = True

-   remove_tags_before = {'id':'printheader'}
-
-   remove_tags = [
-       dict(name='div',attrs={'id':'printheader'})
-       ,dict(name='a', attrs={'href':re.compile('http://ads\.pheedo\.com.*')})
-       ,dict(name='div',attrs={'id':'footer'})
-   ]

    extra_css = '''
    h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
@@ -43,33 +33,13 @@ class SanFranciscoChronicle(BasicNewsRecipe):
    '''

    feeds = [
-       (u'Top News Stories', u'http://www.sfgate.com/rss/feeds/news.xml')
+       (u'Bay Area News', u'http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php'),
+       (u'City Insider', u'http://www.sfgate.com/default/feed/City-Insider-Blog-573.php'),
+       (u'Crime Scene', u'http://www.sfgate.com/rss/feed/Crime-Scene-Blog-599.php'),
+       (u'Education News', u'http://www.sfgate.com/education/feed/Education-News-from-SFGate-430.php'),
+       (u'National News', u'http://www.sfgate.com/rss/feed/National-News-RSS-Feed-435.php'),
+       (u'Weird News', u'http://www.sfgate.com/weird/feed/Weird-News-RSS-Feed-433.php'),
+       (u'World News', u'http://www.sfgate.com/rss/feed/World-News-From-SFGate-432.php'),
    ]

-   def print_version(self,url):
-       url= url +"&type=printable"
-       return url
-
-   def get_article_url(self, article):
-       print str(article['title_detail']['value'])
-       url = article.get('guid',None)
-       url = "http://www.sfgate.com/cgi-bin/article.cgi?f="+url
-       if "Presented By:" in str(article['title_detail']['value']):
-           url = ''
-       return url
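
This rewrite leans on auto_cleanup, calibre's readability-style article extraction, which is why the hand-tuned remove_tags list, print_version and get_article_url could all be dropped. A hedged sketch of the minimal shape such a recipe takes (class name and feed URL are placeholders):

    from calibre.web.feeds.news import BasicNewsRecipe

    class AutoCleanupSketch(BasicNewsRecipe):  # hypothetical name
        title = 'Example Feed'
        auto_cleanup = True   # heuristic body extraction, no manual tag lists
        no_stylesheets = True
        feeds = [('News', 'http://example.com/rss.xml')]  # placeholder URL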
@ -1,30 +1,30 @@
+# vim:fileencoding=UTF-8
+from __future__ import unicode_literals
 __license__ = 'GPL v3'
-__copyright__ = '2011, Eddie Lau'
+__copyright__ = '2011-2013, Eddie Lau'

 # data source: normal, mobile
 __Source__ = 'mobile'
 # please replace the following "True" with "False". (Default: True)
 __MakePeriodical__ = True
 # Turn below to True if your device supports display of CJK titles (Default: False)
-__UseChineseTitle__ = False
+__UseChineseTitle__ = True
 # Set it to False if you want to skip images (Default: True)
 __KeepImages__ = True
 # Set it to True if you want to include a summary in Kindle's article view (Default: False)
-__IncludeSummary__ = False
+__IncludeSummary__ = True
 # Set it to True if you want thumbnail images in Kindle's article view (Default: True)
 __IncludeThumbnails__ = True


 '''
 Change Log:
+2013/03/31 -- fix cover retrieval code and heading size, and remove in summary
 2011/12/29 -- first version done
-TODO:
-* use alternative source at http://m.singtao.com/index.php
 '''

 from calibre.utils.date import now as nowf
 import os, datetime, re
-from datetime import date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from contextlib import nested
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -41,7 +41,7 @@ class STHKRecipe(BasicNewsRecipe):
     title = 'Sing Tao Daily - Hong Kong'
     description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
     category = 'Chinese, News, Hong Kong'
-    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:150%;}'
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'
     masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
     if __Source__ == 'normal':
         keep_only_tags = [dict(name='td', attrs={'class':['bodyhead','bodytext']})]
@ -96,17 +96,13 @@ class STHKRecipe(BasicNewsRecipe):
         return self.get_dtlocal().strftime("%d")

     def get_cover_url(self):
-        #cover = 'http://singtao.com/media/a/a(2660).jpg' # for 2011/12/29
-        base = 2660
-        todaydate = date(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()))
-        diff = todaydate - date(2011, 12, 29)
-        base = base + int(diff.total_seconds()/(3600*24))
-        cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg'
+        soup = self.index_to_soup('http://m.singtao.com/')
+        cover = soup.find(attrs={'class':'special'}).get('src', False)
         br = BasicNewsRecipe.get_browser(self)
         try:
             br.open(cover)
         except:
-            cover = 'http://singtao.com/images/stlogo.gif'
+            cover = None
         return cover

     def parse_index(self):
@ -289,11 +285,11 @@ class STHKRecipe(BasicNewsRecipe):
         # the text may or may not be enclosed in <p></p> tag
         paras = articlebody.findAll('p')
         if not paras:
             paras = articlebody
         textFound = False
         for p in paras:
             if not textFound:
-                summary_candidate = self.tag_to_string(p).strip()
+                summary_candidate = self.tag_to_string(p).strip().replace(' ', '')
                 if len(summary_candidate) > 0:
                     summary_candidate = summary_candidate.replace(u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                     article.summary = article.text_summary = summary_candidate
@ -489,3 +485,4 @@ class STHKRecipe(BasicNewsRecipe):
 
 
+
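Two things are worth noting in the get_cover_url rewrite above: the cover URL is now scraped from the mobile front page instead of being extrapolated from a hard-coded image counter, and a failed download now yields None rather than a stale logo, which lets calibre fall back to its auto-generated periodical cover. The probe-and-fallback shape in isolation (URLs as in the diff; the broad except mirrors the recipe):

    def get_cover_url(self):
        soup = self.index_to_soup('http://m.singtao.com/')
        cover = soup.find(attrs={'class': 'special'}).get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)  # verify the image actually resolves
        except:
            cover = None    # None -> calibre supplies a default cover
        return cover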
recipes/sport_pl.recipe (new file, 71 lines)
@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = 'teepel 2012'
+
+'''
+sport.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class sport_pl(BasicNewsRecipe):
+    title = 'Sport.pl'
+    __author__ = 'teepel <teepel44@gmail.com>'
+    language = 'pl'
+    description = u'Największy portal sportowy w Polsce. Wiadomości sportowe z najważniejszych wydarzeń, relacje i wyniki meczów na żywo.'
+    masthead_url = 'http://press.gazeta.pl/file/mediakit/154509/c8/sportpl.jpg'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    remove_javascript = True
+    no_stylesheets = True
+    remove_empty_feeds = True
+    ignore_duplicate_articles = {'title', 'url'}
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'}))
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'a', attrs = {'href' : 'www.gazeta.pl'}))
+
+    feeds = [
+        (u'Wszystkie wiadomości', u'http://rss.gazeta.pl/pub/rss/sport.xml'),
+        (u'Piłka nożna', u'http://www.sport.pl/pub/rss/sport/pilka_nozna.htm'),
+        (u'F1', u'http://www.sport.pl/pub/rss/sportf1.htm'),
+        (u'Tenis', u'http://serwisy.gazeta.pl/pub/rss/tenis.htm'),
+        (u'Siatkówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611628/index.rss'),
+        (u'Koszykówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611647/index.rss'),
+        (u'Piłka ręczna', u'http://gazeta.pl.feedsportal.com/c/32739/f/611635/index.rss'),
+        (u'Inne sporty', u'http://gazeta.pl.feedsportal.com/c/32739/f/611649/index.rss'),
+    ]
+    def parse_feeds(self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                if '[ZDJĘCIA]' in article.title:
+                    article.title = article.title.replace('[ZDJĘCIA]','')
+                elif '[WIDEO]' in article.title:
+                    article.title = article.title.replace('[WIDEO]','')
+        return feeds
+
+    def print_version(self, url):
+        if 'feedsportal' in url:
+            segment = url.split('/')
+            urlPart = segment[-2]
+            urlPart = urlPart.replace('0L0Ssport0Bpl0C','')
+            urlPart = urlPart.replace('0C10H','/')
+            urlPart = urlPart.replace('0H',',')
+            urlPart = urlPart.replace('0I','_')
+            urlPart = urlPart.replace('A','')
+            segment1 = urlPart.split('/')
+            seg1 = segment1[0]
+            seg2 = segment1[1]
+            segment2 = seg2.split(',')
+            part = segment2[0] + ',' + segment2[1]
+            return 'http://www.sport.pl/' + seg1 + '/2029020,' + part + '.html'
+        else:
+            segment = url.split('/')
+            part2 = segment[-2]
+            part1 = segment[-1]
+            segment2 = part1.split(',')
+            part = segment2[1] + ',' + segment2[2]
+            return 'http://www.sport.pl/' + part2 + '/2029020,' + part + '.html'
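The feedsportal branch of print_version above is a URL de-obfuscator: gazeta.pl's feedsportal links encode the real article path in the second-to-last path segment, with 0B standing for '.', 0C for '/', 0H for ',', 0I for '_', and stray 'A' characters as padding. Tracing the same replace chain on an invented segment shows the shape of the output:

    def decode_segment(seg):
        # Same substitutions as print_version above (the input below is invented).
        seg = seg.replace('0L0Ssport0Bpl0C', '')  # encoded 'sport.pl/' prefix
        seg = seg.replace('0C10H', '/')           # '/' plus article-id separator
        seg = seg.replace('0H', ',')
        seg = seg.replace('0I', '_')
        seg = seg.replace('A', '')                # padding
        return seg

    # decode_segment('0L0Ssport0Bpl0Cpilka0C10H1230H99AA') -> 'pilka/123,99'
    # print_version then rebuilds: http://www.sport.pl/pilka/2029020,123,99.html

The parse_feeds override in the same file is the standard hook for feed-wide title cleanup; it runs after all feeds are parsed, so stripping the '[ZDJĘCIA]' and '[WIDEO]' markers there affects every article uniformly.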
recipes/sportowefakty.recipe (new file, 70 lines)
@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.magick import Image
+
+class sportowefakty(BasicNewsRecipe):
+    title = u'SportoweFakty'
+    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>, Tomasz Długosz <tomek3d@gmail.com>'
+    language = 'pl'
+    description = u'Najważniejsze informacje sportowe z kraju i ze świata, relacje, komentarze, wywiady, zdjęcia!'
+    oldest_article = 1
+    masthead_url = 'http://www.sportowefakty.pl/images/logo.png'
+    max_articles_per_feed = 100
+    simultaneous_downloads = 5
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(attrs = {'class' : 'box-article'})]
+    remove_tags = []
+    remove_tags.append(dict(attrs = {'class' : re.compile(r'^newsStream')}))
+    remove_tags.append(dict(attrs = {'target' : '_blank'}))
+
+    feeds = [
+        (u'Piłka Nożna', u'http://www.sportowefakty.pl/pilka-nozna/index.rss'),
+        (u'Koszykówka', u'http://www.sportowefakty.pl/koszykowka/index.rss'),
+        (u'Żużel', u'http://www.sportowefakty.pl/zuzel/index.rss'),
+        (u'Siatkówka', u'http://www.sportowefakty.pl/siatkowka/index.rss'),
+        (u'Zimowe', u'http://www.sportowefakty.pl/zimowe/index.rss'),
+        (u'Hokej', u'http://www.sportowefakty.pl/hokej/index.rss'),
+        (u'Moto', u'http://www.sportowefakty.pl/moto/index.rss'),
+        (u'Tenis', u'http://www.sportowefakty.pl/tenis/index.rss')
+    ]
+
+    def get_article_url(self, article):
+        link = article.get('link', None)
+        if 'utm_source' in link:
+            return link.split('?utm')[0]
+        else:
+            return link
+
+    def print_version(self, url):
+        print_url = url + '/drukuj'
+        return print_url
+
+    def preprocess_html(self, soup):
+        head = soup.find('h1')
+        if 'Fotorelacja' in self.tag_to_string(head):
+            return None
+        else:
+            for alink in soup.findAll('a'):
+                if alink.string is not None:
+                    tstr = alink.string
+                    alink.replaceWith(tstr)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            if img < 0:
+                raise RuntimeError('Out of memory')
+            img.type = "GrayscaleType"
+            img.save(iurl)
+        return soup
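postprocess_html above re-saves every downloaded image in grayscale through calibre's bundled ImageMagick wrapper, shrinking the output for monochrome readers. The carried-over `if img < 0` guard compares the Image object itself to an integer, so under CPython 2's mixed-type ordering it is effectively dead code. The conversion step on its own looks like this (the path is invented):

    from calibre.utils.magick import Image

    def to_grayscale(path):
        img = Image()
        img.open(path)              # load from the given file
        img.type = 'GrayscaleType'  # ImageMagick image type
        img.save(path)              # overwrite in place

    # to_grayscale('/tmp/example.jpg')  # hypothetical path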
@ -1,18 +1,20 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 class Tablety_pl(BasicNewsRecipe):
     title = u'Tablety.pl'
     __author__ = 'fenuks'
     description = u'Tablety, gry i aplikacje na tablety.'
     masthead_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
     cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
     category = 'IT'
     language = 'pl'
-    use_embedded_content = True
+    use_embedded_content = False
+    no_stylesheets = True
     oldest_article = 8
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
+    keep_only_tags = [dict(id='news_block')]
     #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
     #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
-    #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
+    remove_tags = [dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments']})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
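Flipping use_embedded_content to False makes calibre fetch each article's web page rather than the feed's embedded (and often truncated) body; that is why the diff adds keep_only_tags to isolate the article container. The matchers are BeautifulSoup-style, as seen across these recipes:

    # Keep only the subtree whose HTML id is "news_block":
    keep_only_tags = [dict(id='news_block')]
    # Equivalent patterns used elsewhere in this merge:
    #   dict(attrs={'class': 'box-article'})
    #   dict(name='div', attrs={'id': 'article'})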
@ -20,7 +20,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     no_stylesheets = True

     ignore_duplicate_articles = {'title','url'}
+    compress_news_images = True

     extra_css = '''
     body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
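compress_news_images = True asks calibre to recompress downloaded article images to keep the output file small. Related knobs exist on BasicNewsRecipe for finer control; a hedged sketch (attribute names per calibre's recipe API of this era, and the size value is illustrative):

    compress_news_images = True
    # Optionally cap per-image size in KB; assumption: available alongside
    # compress_news_images in this calibre release.
    compress_news_images_max_size = 100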
@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'

 '''
 theonion.com
@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class TheOnion(BasicNewsRecipe):
     title = 'The Onion'
     __author__ = 'Darko Miletic'
-    description = "America's finest news source"
+    description = "The Onion, America's Finest News Source, is an award-winning publication covering world, national, and * local issues. It is updated daily online and distributed weekly in select American cities."
     oldest_article = 2
     max_articles_per_feed = 100
     publisher = 'Onion, Inc.'
@ -20,7 +20,8 @@ class TheOnion(BasicNewsRecipe):
     use_embedded_content = False
     encoding = 'utf-8'
     publication_type = 'newsportal'
-    masthead_url = 'http://o.onionstatic.com/img/headers/onion_190.png'
+    needs_subscription = 'optional'
+    masthead_url = 'http://www.theonion.com/static/onion/img/logo_1x.png'
     extra_css = """
         body{font-family: Helvetica,Arial,sans-serif}
         .section_title{color: gray; text-transform: uppercase}
@ -37,18 +38,12 @@ class TheOnion(BasicNewsRecipe):
     , 'language' : language
     }

-    keep_only_tags = [
-        dict(name='h2', attrs={'class':['section_title','title']})
-        ,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
-        ,dict(attrs={'id':['entries']})
-    ]
-    remove_attributes=['lang','rel']
-    remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
-    remove_tags = [
-        dict(name=['object','link','iframe','base','meta'])
-        ,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
-        ,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
-    ]
+    keep_only_tags = [dict(attrs={'class':'full-article'})]
+    remove_attributes = ['lang','rel']
+    remove_tags = [
+        dict(name=['object','link','iframe','base','meta'])
+        ,dict(attrs={'class':lambda x: x and 'share-tools' in x.split()})
+    ]

     feeds = [
@ -56,6 +51,17 @@ class TheOnion(BasicNewsRecipe):
     ,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
     ]

+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        br.open('http://www.theonion.com/')
+        if self.username is not None and self.password is not None:
+            br.open('https://ui.ppjol.com/login/onion/u/j_spring_security_check')
+            br.select_form(name='f')
+            br['j_username'] = self.username
+            br['j_password'] = self.password
+            br.submit()
+        return br
+
     def get_article_url(self, article):
         artl = BasicNewsRecipe.get_article_url(self, article)
         if artl.startswith('http://www.theonion.com/audio/'):
@ -79,4 +85,8 @@ class TheOnion(BasicNewsRecipe):
         else:
             str = self.tag_to_string(item)
             item.replaceWith(str)
+        for item in soup.findAll('img'):
+            if item.has_key('data-src'):
+                item['src'] = item['data-src']
         return soup
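Two functional additions ride along with The Onion's tag-list cleanup above: an optional paywall login in get_browser, and a loop that copies each img tag's data-src into src. The latter is the usual counter to lazy-loaded images, where the page keeps the real URL in data-src and leaves src pointing at a placeholder. Standalone, on invented markup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = '<img src="blank.gif" data-src="http://example.com/real.jpg"/>'  # invented
    soup = BeautifulSoup(html)
    for img in soup.findAll('img'):
        if img.has_key('data-src'):       # BS3-era API, as used in the recipe
            img['src'] = img['data-src']
    # calibre's downloader now sees the real image URL in src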
@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
 tomshardware.com/us
 '''
@ -16,22 +14,20 @@ class Tomshardware(BasicNewsRecipe):
     publisher = "Tom's Hardware"
     category = 'news, IT, hardware, USA'
     no_stylesheets = True
-    needs_subscription = True
+    needs_subscription = 'optional'
     language = 'en'

     INDEX = 'http://www.tomshardware.com'
     LOGIN = INDEX + '/membres/'
     remove_javascript = True
     use_embedded_content= False

-    html2lrf_options = [
-        '--comment', description
-        , '--category', category
-        , '--publisher', publisher
-    ]
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    conversion_options = {
+        'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+    }

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX+'/us/')
@ -50,8 +46,8 @@ class Tomshardware(BasicNewsRecipe):
     ]

     feeds = [
-        (u'Latest Articles', u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-2.xml')
-        ,(u'Latest News' , u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-1.xml')
+        (u'Reviews', u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-2.xml')
+        ,(u'News' , u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-1.xml')
     ]

     def print_version(self, url):
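The html2lrf_options/html2epub_options pair removed above is the old per-output-format option plumbing; conversion_options is the format-neutral replacement, and the keys map one-to-one:

    conversion_options = {
        'comment'   : description,  # was --comment
        'tags'      : category,     # was --category
        'publisher' : publisher,    # was --publisher
        'language'  : language,     # newly passed through
    }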
recipes/universe_today.recipe (new file, 17 lines)
@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class UniverseToday(BasicNewsRecipe):
+    title = u'Universe Today'
+    language = 'en'
+    description = u'Space and astronomy news.'
+    __author__ = 'seird'
+    publisher = u'universetoday.com'
+    category = 'science, astronomy, news, rss'
+    oldest_article = 7
+    max_articles_per_feed = 40
+    auto_cleanup = True
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_empty_feeds = True
+
+    feeds = [(u'Universe Today', u'http://feeds.feedburner.com/universetoday/pYdq')]
@ -6,17 +6,62 @@ __license__ = 'GPL v3'
 www.canada.com
 '''
 import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe

 from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup


 class TimesColonist(BasicNewsRecipe):

+    # Customization -- remove sections you don't want.
+    # If your e-reader is an e-ink Kindle and your output profile is
+    # set properly this recipe will not include images because the
+    # resulting file is too large. If you have one of these and want
+    # images you can set kindle_omit_images = False
+    # and remove sections (typically the e-ink Kindles will
+    # work with about a dozen of these, but your mileage may vary).
+
+    kindle_omit_images = True
+
+    section_list = [
+        ('','Web Front Page'),
+        ('news/','News Headlines'),
+        ('news/b-c/','BC News'),
+        ('news/national/','National News'),
+        ('news/world/','World News'),
+        ('opinion/','Opinion'),
+        ('opinion/letters/','Letters'),
+        ('business/','Business'),
+        ('business/money/','Money'),
+        ('business/technology/','Technology'),
+        ('business/working/','Working'),
+        ('sports/','Sports'),
+        ('sports/hockey/','Hockey'),
+        ('sports/football/','Football'),
+        ('sports/basketball/','Basketball'),
+        ('sports/golf/','Golf'),
+        ('entertainment/','entertainment'),
+        ('entertainment/go/','Go!'),
+        ('entertainment/music/','Music'),
+        ('entertainment/books/','Books'),
+        ('entertainment/Movies/','Movies'),
+        ('entertainment/television/','Television'),
+        ('life/','Life'),
+        ('life/health/','Health'),
+        ('life/travel/','Travel'),
+        ('life/driving/','Driving'),
+        ('life/homes/','Homes'),
+        ('life/food-drink/','Food & Drink')
+    ]
+
     title = u'Victoria Times Colonist'
     url_prefix = 'http://www.timescolonist.com'
     description = u'News from Victoria, BC'
     fp_tag = 'CAN_TC'

+    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
+
     url_list = []
     language = 'en_CA'
     __author__ = 'Nick Redding'
@ -29,15 +74,21 @@ class TimesColonist(BasicNewsRecipe):
     .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
     '''
     keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]
-    remove_tags = [{'class':'comments'},
-                   {'id':'photocredit'},
-                   dict(name='div', attrs={'class':re.compile('top.controls')}),
-                   dict(name='div', attrs={'class':re.compile('social')}),
-                   dict(name='div', attrs={'class':re.compile('tools')}),
-                   dict(name='div', attrs={'class':re.compile('bottom.tools')}),
-                   dict(name='div', attrs={'class':re.compile('window')}),
-                   dict(name='div', attrs={'class':re.compile('related.news.element')})]
+
+    def __init__(self, options, log, progress_reporter):
+        self.remove_tags = [{'class':'comments'},
+                            {'id':'photocredit'},
+                            dict(name='div', attrs={'class':re.compile('top.controls')}),
+                            dict(name='div', attrs={'class':re.compile('^comments')}),
+                            dict(name='div', attrs={'class':re.compile('social')}),
+                            dict(name='div', attrs={'class':re.compile('tools')}),
+                            dict(name='div', attrs={'class':re.compile('bottom.tools')}),
+                            dict(name='div', attrs={'class':re.compile('window')}),
+                            dict(name='div', attrs={'class':re.compile('related.news.element')})]
+        print("PROFILE NAME = "+options.output_profile.short_name)
+        if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
+            self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
+        BasicNewsRecipe.__init__(self, options, log, progress_reporter)

     def get_cover_url(self):
         from datetime import timedelta, date
@ -122,7 +173,6 @@ class TimesColonist(BasicNewsRecipe):
     def preprocess_html(self,soup):
         byline = soup.find('p',attrs={'class':re.compile('ancillary')})
         if byline is not None:
-            byline.find('a')
             authstr = self.tag_to_string(byline,False)
             authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
             authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
@ -149,9 +199,10 @@ class TimesColonist(BasicNewsRecipe):
         atag = htag.a
         if atag is not None:
             url = atag['href']
-            #print("Checking "+url)
-            if atag['href'].startswith('/'):
-                url = self.url_prefix+atag['href']
+            url = url.strip()
+            # print("Checking >>"+url+'<<\n\r')
+            if url.startswith('/'):
+                url = self.url_prefix+url
             if url in self.url_list:
                 return
             self.url_list.append(url)
@ -171,10 +222,10 @@ class TimesColonist(BasicNewsRecipe):
         if dtag is not None:
             description = self.tag_to_string(dtag,False)
         article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
-        #print(sectitle+title+": description = "+description+" URL="+url)
+        print(sectitle+title+": description = "+description+" URL="+url+'\n\r')

     def add_section_index(self,ans,securl,sectitle):
-        print("Add section url="+self.url_prefix+'/'+securl)
+        print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
         try:
             soup = self.index_to_soup(self.url_prefix+'/'+securl)
         except:
@ -193,33 +244,7 @@ class TimesColonist(BasicNewsRecipe):

     def parse_index(self):
         ans = []
-        ans = self.add_section_index(ans,'','Web Front Page')
-        ans = self.add_section_index(ans,'news/','News Headlines')
-        ans = self.add_section_index(ans,'news/b-c/','BC News')
-        ans = self.add_section_index(ans,'news/national/','Natioanl News')
-        ans = self.add_section_index(ans,'news/world/','World News')
-        ans = self.add_section_index(ans,'opinion/','Opinion')
-        ans = self.add_section_index(ans,'opinion/letters/','Letters')
-        ans = self.add_section_index(ans,'business/','Business')
-        ans = self.add_section_index(ans,'business/money/','Money')
-        ans = self.add_section_index(ans,'business/technology/','Technology')
-        ans = self.add_section_index(ans,'business/working/','Working')
-        ans = self.add_section_index(ans,'sports/','Sports')
-        ans = self.add_section_index(ans,'sports/hockey/','Hockey')
-        ans = self.add_section_index(ans,'sports/football/','Football')
-        ans = self.add_section_index(ans,'sports/basketball/','Basketball')
-        ans = self.add_section_index(ans,'sports/golf/','Golf')
-        ans = self.add_section_index(ans,'entertainment/','entertainment')
-        ans = self.add_section_index(ans,'entertainment/go/','Go!')
-        ans = self.add_section_index(ans,'entertainment/music/','Music')
-        ans = self.add_section_index(ans,'entertainment/books/','Books')
-        ans = self.add_section_index(ans,'entertainment/Movies/','movies')
-        ans = self.add_section_index(ans,'entertainment/television/','Television')
-        ans = self.add_section_index(ans,'life/','Life')
-        ans = self.add_section_index(ans,'life/health/','Health')
-        ans = self.add_section_index(ans,'life/travel/','Travel')
-        ans = self.add_section_index(ans,'life/driving/','Driving')
-        ans = self.add_section_index(ans,'life/homes/','Homes')
-        ans = self.add_section_index(ans,'life/food-drink/','Food & Drink')
+        for (url,title) in self.section_list:
+            ans = self.add_section_index(ans,url,title)
         return ans
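The Times Colonist rewrite above turns parse_index from 28 hard-coded add_section_index calls into a loop over section_list, so trimming the download is now a matter of deleting tuples; it also moves remove_tags into __init__, where the recipe can inspect options.output_profile before deciding whether to strip image containers for e-ink Kindles. A trimmed customization, per the comment in the diff (the selection shown is illustrative):

    # Fetch only a few sections; each tuple is (URL suffix, section title).
    section_list = [
        ('', 'Web Front Page'),
        ('news/world/', 'World News'),
        ('business/', 'Business'),
    ]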