Merge from trunk
@@ -38,6 +38,8 @@ calibre_plugins/
 recipes/.git
 recipes/.gitignore
 recipes/README.md
+recipes/icon_checker.py
+recipes/readme_updater.py
 recipes/katalog_egazeciarz.recipe
 recipes/tv_axnscifi.recipe
 recipes/tv_comedycentral.recipe
@@ -60,6 +62,7 @@ recipes/tv_tvpkultura.recipe
 recipes/tv_tvppolonia.recipe
 recipes/tv_tvpuls.recipe
 recipes/tv_viasathistory.recipe
+recipes/icons/katalog_egazeciarz.png
 recipes/icons/tv_axnscifi.png
 recipes/icons/tv_comedycentral.png
 recipes/icons/tv_discoveryscience.png

--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -1,3 +1,4 @@
+# vim:fileencoding=UTF-8:ts=2:sw=2:sta:et:sts=2:ai
 # Each release can have new features and bug fixes. Each of which
 # must have a title and can optionally have linked tickets and a description.
 # In addition they can have a type field which defaults to minor, but should be major
@@ -19,6 +20,105 @@
 # new recipes:
 #   - title:
 
+- version: 0.9.25
+  date: 2013-03-29
+
+  new features:
+    - title: "Automatic adding: When checking for duplicates is enabled, use the same duplicates found dialog as is used during manual adding."
+      tickets: [1160914]
+
+    - title: "ToC Editor: Allow searching to find a location quickly when browsing through the book to select a location for a ToC item"
+
+    - title: "ToC Editor: Add a button to quickly flatten the entire table of contents"
+
+    - title: "Conversion: When converting a single book to EPUB or AZW3, add an option to automatically launch the Table of Contents editor after the conversion completes. Found under the Table of Contents section of the conversion dialog."
+
+  bug fixes:
+    - title: "calibredb: Nicer error messages when user provides invalid input"
+      tickets: [1160452,1160631]
+
+    - title: "News download: Always use the .jpg extension for jpeg images as apparently Moon+ Reader cannot handle .jpeg"
+
+    - title: "Fix Book Details popup keyboard navigation doesn't work on a Mac"
+      tickets: [1159610]
+
+    - title: "Fix a regression that caused the case of the book files to not be changed when changing the case of the title/author on case insensitive filesystems"
+
+  improved recipes:
+    - RTE news
+    - Various Polish news sources
+    - Psychology Today
+    - Foreign Affairs
+    - History Today
+    - Harpers Magazine (printed edition)
+    - Business Week Magazine
+    - The Hindu
+    - Irish Times
+    - Le Devoir
+
+  new recipes:
+    - title: Fortune Magazine
+      author: Rick Shang
+
+    - title: Eclipse Online
+      author: Jim DeVona
+
+- version: 0.9.24
+  date: 2013-03-22
+
+  new features:
+    - title: "ToC Editor: Allow auto-generation of Table of Contents entries from headings and/or links in the book"
+
+    - title: "EPUB/MOBI Catalogs: Allow saving used settings as presets which can be loaded easily later."
+      tickets: [1155587]
+
+    - title: "Indicate which columns are custom columns when selecting columns in the Preferences"
+      tickets: [1158066]
+
+    - title: "News download: Add an option recipe authors can set to have calibre automatically reduce the size of downloaded images by lowering their quality"
+
+  bug fixes:
+    - title: "News download: Fix a regression in 0.9.23 that prevented oldest_article from working with some RSS feeds."
+
+    - title: "Conversion: handle the :before and :after pseudo CSS selectors correctly"
+
+    - title: "AZW3 Output: Handle the case of the <guide> reference to a ToC containing an anchor correctly."
+      tickets: [1158413]
+
+    - title: "BiBTeX catalogs: Fix ISBN not being output and the library_name field causing catalog generation to fail"
+      tickets: [1156432, 1158127]
+
+    - title: "Conversion: Add support for CSS stylesheets that wrap their rules inside a @media rule."
+      tickets: [1157345]
+
+    - title: "Cover browser: Fix scrolling not working for books after the 32678'th book in a large library."
+      tickets: [1153204]
+
+    - title: "Linux: Update bundled libmtp version"
+
+    - title: "Clear the Book details panel when the current search returns no matches."
+      tickets: [1153026]
+
+    - title: "Fix a regression that broke creation of advanced column coloring rules"
+      tickets: [1156291]
+
+    - title: "Amazon metadata download: Handle cover images loaded via javascript on the amazon.de site"
+
+    - title: "Nicer error message when exporting a generated csv catalog to a file open in another program on windows."
+      tickets: [1155539]
+
+    - title: "Fix ebook-convert -h showing ANSI escape codes in the windows command prompt"
+      tickets: [1158499]
+
+  improved recipes:
+    - Various Polish news sources
+    - kath.net
+    - Il Giornale
+    - Kellog Insight
+
+  new recipes:
+    - title:
+
 - version: 0.9.23
   date: 2013-03-15
 
@@ -434,6 +434,18 @@ a number of older formats either do not support a metadata based Table of Conten
 documents do not have one. In these cases, the options in this section can help you automatically
 generate a Table of Contents in the converted ebook, based on the actual content in the input document.
 
+.. note:: Using these options can be a little challenging to get exactly right.
+    If you prefer creating/editing the Table of Contents by hand, convert to
+    the EPUB or AZW3 formats and select the checkbox at the bottom of the
+    screen that says
+    :guilabel:`Manually fine-tune the Table of Contents after conversion`.
+    This will launch the ToC Editor tool after the conversion. It allows you to
+    create entries in the Table of Contents by simply clicking the place in the
+    book where you want the entry to point. You can also use the ToC Editor by
+    itself, without doing a conversion. Go to :guilabel:`Preferences->Toolbars`
+    and add the ToC Editor to the main toolbar. Then just select the book you
+    want to edit and click the ToC Editor button.
+
 The first option is :guilabel:`Force use of auto-generated Table of Contents`. By checking this option
 you can have |app| override any Table of Contents found in the metadata of the input document with the
 auto generated one.
@@ -456,7 +468,7 @@ For example, to remove all entries titles "Next" or "Previous" use::
 
     Next|Previous
 
-Finally, the :guilabel:`Level 1,2,3 TOC` options allow you to create a sophisticated multi-level Table of Contents.
+The :guilabel:`Level 1,2,3 TOC` options allow you to create a sophisticated multi-level Table of Contents.
 They are XPath expressions that match tags in the intermediate XHTML produced by the conversion pipeline. See the
 :ref:`conversion-introduction` for how to get access to this XHTML. Also read the :ref:`xpath-tutorial`, to learn
 how to construct XPath expressions. Next to each option is a button that launches a wizard to help with the creation
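Note (illustration, not part of the diff): the Level 1/2/3 options each take one XPath expression, evaluated against the intermediate XHTML. Assuming a book whose chapter headings are h1 tags and section headings are h2 tags, a two-level Table of Contents could be configured as::

    Level 1 TOC : //h:h1
    Level 2 TOC : //h:h2

The ``h:`` prefix binds the XHTML namespace, per the XPath tutorial referenced above.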
@@ -87,7 +87,9 @@ this bug.
 
 How do I convert a collection of HTML files in a specific order?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-In order to convert a collection of HTML files in a specific oder, you have to create a table of contents file. That is, another HTML file that contains links to all the other files in the desired order. Such a file looks like::
+In order to convert a collection of HTML files in a specific oder, you have to
+create a table of contents file. That is, another HTML file that contains links
+to all the other files in the desired order. Such a file looks like::
 
     <html>
     <body>
@@ -102,19 +104,36 @@ In order to convert a collection of HTML files in a specific oder, you have to c
     </body>
     </html>
 
-Then just add this HTML file to the GUI and use the convert button to create your ebook.
+Then, just add this HTML file to the GUI and use the convert button to create
+your ebook. You can use the option in the Table of Contents section in the
+conversion dialog to control how the Table of Contents is generated.
 
-.. note:: By default, when adding HTML files, |app| follows links in the files in *depth first* order. This means that if file A.html links to B.html and C.html and D.html, but B.html also links to D.html, then the files will be in the order A.html, B.html, D.html, C.html. If instead you want the order to be A.html, B.html, C.html, D.html then you must tell |app| to add your files in *breadth first* order. Do this by going to Preferences->Plugins and customizing the HTML to ZIP plugin.
+.. note:: By default, when adding HTML files, |app| follows links in the files
+    in *depth first* order. This means that if file A.html links to B.html and
+    C.html and D.html, but B.html also links to D.html, then the files will be
+    in the order A.html, B.html, D.html, C.html. If instead you want the order
+    to be A.html, B.html, C.html, D.html then you must tell |app| to add your
+    files in *breadth first* order. Do this by going to Preferences->Plugins
+    and customizing the HTML to ZIP plugin.
 
 The EPUB I produced with |app| is not valid?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-|app| does not guarantee that an EPUB produced by it is valid. The only guarantee it makes is that if you feed it valid XHTML 1.1 + CSS 2.1 it will output a valid EPUB. |app| is designed for ebook consumers, not producers. It tries hard to ensure that EPUBs it produces actually work as intended on a wide variety of devices, a goal that is incompatible with producing valid EPUBs, and one that is far more important to the vast majority of its users. If you need a tool that always produces valid EPUBs, |app| is not for you.
+|app| does not guarantee that an EPUB produced by it is valid. The only
+guarantee it makes is that if you feed it valid XHTML 1.1 + CSS 2.1 it will
+output a valid EPUB. |app| is designed for ebook consumers, not producers. It
+tries hard to ensure that EPUBs it produces actually work as intended on a wide
+variety of devices, a goal that is incompatible with producing valid EPUBs, and
+one that is far more important to the vast majority of its users. If you need a
+tool that always produces valid EPUBs, |app| is not for you.
 
 How do I use some of the advanced features of the conversion tools?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-You can get help on any individual feature of the converters by mousing over it in the GUI or running ``ebook-convert dummy.html .epub -h`` at a terminal. A good place to start is to look at the following demo files that demonstrate some of the advanced features:
-* `html-demo.zip <http://calibre-ebook.com/downloads/html-demo.zip>`_
+You can get help on any individual feature of the converters by mousing over
+it in the GUI or running ``ebook-convert dummy.html .epub -h`` at a terminal.
+A good place to start is to look at the following demo file that demonstrates
+some of the advanced features
+`html-demo.zip <http://calibre-ebook.com/downloads/html-demo.zip>`_
 
 
 Device Integration
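Note (illustration, not part of the diff): the table of contents file described above can also be generated rather than written by hand. A minimal sketch, assuming all the HTML files sit in one directory and alphabetical filename order is the desired reading order:

    #!/usr/bin/env python
    # Write a toc.html that links to every .html file in a directory,
    # in sorted filename order, using the structure shown in the FAQ above.
    import os, sys

    def make_toc(directory):
        names = sorted(n for n in os.listdir(directory) if n.endswith('.html'))
        links = '\n'.join('<h1><a href="%s">%s</a></h1>' % (n, os.path.splitext(n)[0])
                          for n in names)
        with open(os.path.join(directory, 'toc.html'), 'w') as out:
            out.write('<html>\n<body>\n%s\n</body>\n</html>\n' % links)

    if __name__ == '__main__':
        make_toc(sys.argv[1] if len(sys.argv) > 1 else '.')

Add the generated toc.html to calibre and convert it, as described above.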
@@ -126,11 +145,11 @@ Device Integration
 
 What devices does |app| support?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-|app| can directly connect to all the major (and most of the minor) ebook reading devices,
-smarthphones, tablets, etc.
-In addition, using the :guilabel:`Connect to folder` function you can use it with any ebook reader that exports itself as a USB disk.
-You can even connect to Apple devices (via iTunes), using the :guilabel:`Connect to iTunes`
-function.
+|app| can directly connect to all the major (and most of the minor) ebook
+reading devices, smarthphones, tablets, etc. In addition, using the
+:guilabel:`Connect to folder` function you can use it with any ebook reader
+that exports itself as a USB disk. You can even connect to Apple devices (via
+iTunes), using the :guilabel:`Connect to iTunes` function.
 
 .. _devsupport:
 
@@ -10,46 +10,35 @@ class Adventure_zone(BasicNewsRecipe):
     oldest_article = 20
     max_articles_per_feed = 100
     cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
-    index='http://www.adventure-zone.info/fusion/'
+    index = 'http://www.adventure-zone.info/fusion/'
     use_embedded_content = False
     preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''),
                           (re.compile(r'</?table.*?>'), lambda match: ''),
                           (re.compile(r'</?tbody.*?>'), lambda match: '')]
-    remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
-    remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
-    remove_tags_after= dict(id='comments')
-    extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
+    remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
+    remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
+    remove_tags_after = dict(id='comments')
+    extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; } img.news-category {float: left; margin-right: 5px;}'
     feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
 
-    '''def parse_feeds (self):
-        feeds = BasicNewsRecipe.parse_feeds(self)
-        soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
-        tag = soup.find(name='channel')
-        titles = []
-        for r in tag.findAll(name='image'):
-            r.extract()
-        art = tag.findAll(name='item')
-        for i in art:
-            titles.append(i.title.string)
-        for feed in feeds:
-            for article in feed.articles[:]:
-                article.title = titles[feed.articles.index(article)]
-        return feeds'''
-
-
     '''def get_cover_url(self):
         soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
         cover = soup.find(id='box_OstatninumerAZ')
         self.cover_url = 'http://www.adventure-zone.info/fusion/' + cover.center.a.img['src']
         return getattr(self, 'cover_url', self.cover_url)'''
 
     def populate_article_metadata(self, article, soup, first):
         result = re.search('(.+) - Adventure Zone', soup.title.string)
         if result:
-            article.title = result.group(1)
+            result = result.group(1)
         else:
             result = soup.body.find('strong')
             if result:
-                article.title = result.string
+                result = result.string
+        if result:
+            result = result.replace('&amp;', '&')
+            result = result.replace('&#39;', '’')
+            article.title = result
 
     def skip_ad_pages(self, soup):
         skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
@@ -78,4 +67,3 @@ class Adventure_zone(BasicNewsRecipe):
                 a['href']=self.index + a['href']
         return soup
 
-
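Note (illustration, not part of the diff): the two replace() calls above decode only the two HTML entities this feed is known to use ('&amp;' and '&#39;'). A more general variant could decode every entity; a sketch for the Python 2 runtime these recipes target, with unescape_title as a hypothetical helper name:

    # Hypothetical helper: decode all HTML entities in a feed title
    # instead of special-casing '&amp;' and '&#39;' (Python 2 stdlib).
    from HTMLParser import HTMLParser

    def unescape_title(title):
        return HTMLParser().unescape(title)

    print(unescape_title(u'Tom &amp; Jerry&#39;s'))  # Tom & Jerry's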
--- /dev/null
+++ b/recipes/arret_sur_images.recipe
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+__license__ = 'WTFPL'
+__author__ = '2013, François D. <franek at chicour.net>'
+__description__ = 'Get some fresh news from Arrêt sur images'
+
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class Asi(BasicNewsRecipe):
+
+    title = 'Arrêt sur images'
+    __author__ = 'François D. (aka franek)'
+    description = 'Global news in french from news site "Arrêt sur images"'
+
+    oldest_article = 7.0
+    language = 'fr'
+    needs_subscription = True
+    max_articles_per_feed = 100
+
+    simultaneous_downloads = 1
+    timefmt = '[%a, %d %b %Y %I:%M +0200]'
+    cover_url = 'http://www.arretsurimages.net/images/header/menu/menu_1.png'
+
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds = [
+        ('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'),
+        ('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'),
+        ('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'),
+    ]
+
+    conversion_options = { 'smarten_punctuation' : True }
+
+    remove_tags = [dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'), dict(id='col_right'), dict(name='div', attrs={'class':'bloc-chroniqueur-2'}), dict(id='footercontainer')]
+
+    def print_version(self, url):
+        return url.replace('contenu.php', 'contenu-imprimable.php')
+
+    def get_browser(self):
+        # Need to use robust HTML parser
+        br = BasicNewsRecipe.get_browser(self, use_robust_parser=True)
+        if self.username is not None and self.password is not None:
+            br.open('http://www.arretsurimages.net/index.php')
+            br.select_form(nr=0)
+            br.form.set_all_readonly(False)
+            br['redir'] = 'forum/login.php'
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
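Note (not part of the diff): a subscription recipe like this one can be smoke-tested from a terminal without installing it; USER and PASS are placeholders::

    ebook-convert recipes/arret_sur_images.recipe .epub --username USER --password PASS --test

The --test switch limits the run to a couple of articles per feed.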
@@ -18,3 +18,10 @@ class Astroflesz(BasicNewsRecipe):
     remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
     remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
     feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')]
+
+    def postprocess_html(self, soup, first_fetch):
+        t = soup.find(attrs={'class':'itemIntroText'})
+        if t:
+            for i in t.findAll('img'):
+                i['style'] = 'float: left; margin-right: 5px;'
+        return soup
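Note (illustration, not part of the diff): the new postprocess_html() floats intro images by writing an inline style onto each img tag. The declarative alternative used by several other recipes in this merge would be an extra_css rule; a sketch, assuming the 'itemIntroText' class survives cleanup:

    extra_css = '.itemIntroText img {float: left; margin-right: 5px;}'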
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
 class BadaniaNet(BasicNewsRecipe):
     title = u'badania.net'
     __author__ = 'fenuks'
@@ -6,9 +7,11 @@ class BadaniaNet(BasicNewsRecipe):
     category = 'science'
     language = 'pl'
     cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png'
+    extra_css = '.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
+    preprocess_regexps = [(re.compile(r"<h4>Tekst sponsoruje</h4>", re.IGNORECASE), lambda m: ''),]
     remove_empty_feeds = True
     use_embedded_content = False
     remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})]
@@ -1,5 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
+
 class BenchmarkPl(BasicNewsRecipe):
     title = u'Benchmark.pl'
     __author__ = 'fenuks'
@@ -13,10 +15,10 @@ class BenchmarkPl(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style']
     preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
-    keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
-    remove_tags_after=dict(name='div', attrs={'class':'body'})
-    remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
-    INDEX= 'http://www.benchmark.pl'
+    keep_only_tags = [dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
+    remove_tags_after = dict(id='article')
+    remove_tags = [dict(name='div', attrs={'class':['comments', 'body', 'kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs = {'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
+    INDEX = 'http://www.benchmark.pl'
     feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
             (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
 
@@ -27,7 +29,12 @@ class BenchmarkPl(BasicNewsRecipe):
             soup2 = self.index_to_soup(nexturl['href'])
             nexturl = soup2.find(attrs={'class':'next'})
             pagetext = soup2.find(name='div', attrs={'class':'body'})
-            appendtag.find('div', attrs={'class':'k_ster'}).extract()
+            tag = appendtag.find('div', attrs={'class':'k_ster'})
+            if tag:
+                tag.extract()
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
         if appendtag.find('div', attrs={'class':'k_ster'}):
@@ -37,40 +44,44 @@ class BenchmarkPl(BasicNewsRecipe):
 
 
     def image_article(self, soup, appendtag):
-        nexturl=soup.find('div', attrs={'class':'preview'})
-        if nexturl is not None:
-            nexturl=nexturl.find('a', attrs={'class':'move_next'})
-            image=appendtag.find('div', attrs={'class':'preview'}).div['style'][16:]
-            image=self.INDEX + image[:image.find("')")]
+        nexturl = soup.find('div', attrs={'class':'preview'})
+        if nexturl:
+            nexturl = nexturl.find('a', attrs={'class':'move_next'})
+            image = appendtag.find('div', attrs={'class':'preview'}).div['style'][16:]
+            image = self.INDEX + image[:image.find("')")]
             appendtag.find(attrs={'class':'preview'}).name='img'
             appendtag.find(attrs={'class':'preview'})['src']=image
             appendtag.find('a', attrs={'class':'move_next'}).extract()
-            while nexturl is not None:
-                nexturl= self.INDEX + nexturl['href']
+            while nexturl:
+                nexturl = self.INDEX + nexturl['href']
                 soup2 = self.index_to_soup(nexturl)
-                nexturl=soup2.find('a', attrs={'class':'move_next'})
-                image=soup2.find('div', attrs={'class':'preview'}).div['style'][16:]
-                image=self.INDEX + image[:image.find("')")]
+                nexturl = soup2.find('a', attrs={'class':'move_next'})
+                image = soup2.find('div', attrs={'class':'preview'}).div['style'][16:]
+                image = self.INDEX + image[:image.find("')")]
                 soup2.find(attrs={'class':'preview'}).name='img'
                 soup2.find(attrs={'class':'preview'})['src']=image
-                pagetext=soup2.find('div', attrs={'class':'gallery'})
+                pagetext = soup2.find('div', attrs={'class':'gallery'})
                 pagetext.find('div', attrs={'class':'title'}).extract()
                 pagetext.find('div', attrs={'class':'thumb'}).extract()
                 pagetext.find('div', attrs={'class':'panelOcenaObserwowane'}).extract()
-                if nexturl is not None:
+                if nexturl:
                     pagetext.find('a', attrs={'class':'move_next'}).extract()
                 pagetext.find('a', attrs={'class':'move_back'}).extract()
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                for comment in comments:
+                    comment.extract()
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
 
 
     def preprocess_html(self, soup):
-        if soup.find('div', attrs={'class':'preview'}) is not None:
+        if soup.find('div', attrs={'class':'preview'}):
             self.image_article(soup, soup.body)
         else:
             self.append_page(soup, soup.body)
         for a in soup('a'):
-            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
-                a['href']=self.INDEX + a['href']
+            if a.has_key('href') and not a['href'].startswith('http'):
+                a['href'] = self.INDEX + a['href']
+        for r in soup.findAll(attrs={'class':['comments', 'body']}):
+            r.extract()
         return soup
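Note (illustration, not part of the diff): the same three lines for dropping HTML comment nodes now recur in BenchmarkPl, CoNowegoPl, Dziennik_pl, DziennikWschodni and EchoDnia below. A recipe needing the pattern in several places could factor it out; strip_comments is a hypothetical helper name, not calibre API:

    from calibre.ebooks.BeautifulSoup import Comment

    def strip_comments(tag):
        # Remove every HTML comment node nested anywhere under `tag`.
        for comment in tag.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()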
@@ -14,7 +14,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class biweekly(BasicNewsRecipe):
     __author__ = u'Łukasz Grąbczewski'
     title = 'Biweekly'
-    language = 'en'
+    language = 'en_PL'
     publisher = 'National Audiovisual Institute'
     publication_type = 'magazine'
     description = u'link with culture [English edition of Polish magazine]: literature, theatre, film, art, music, views, talks'
--- /dev/null
+++ b/recipes/blog_biszopa.recipe
@@ -0,0 +1,30 @@
+__license__ = 'GPL v3'
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BlogBiszopa(BasicNewsRecipe):
+    title = u'Blog Biszopa'
+    __author__ = 'fenuks'
+    description = u'Zapiski z Granitowego Miasta'
+    category = 'history'
+    #publication_type = ''
+    language = 'pl'
+    #encoding = ''
+    #extra_css = ''
+    cover_url = 'http://blogbiszopa.pl/wp-content/themes/biszop/images/logo.png'
+    masthead_url = ''
+    use_embedded_content = False
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    remove_attributes = ['style', 'font']
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(id='main-content')]
+    remove_tags = [dict(name='footer')]
+    #remove_tags_after = {}
+    #remove_tags_before = {}
+
+    feeds = [(u'Artyku\u0142y', u'http://blogbiszopa.pl/feed/')]
+
@@ -25,6 +25,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
 
         #Find date
         mag=soup.find('h2',text='Magazine')
+        self.log(mag)
         dates=self.tag_to_string(mag.findNext('h3'))
         self.timefmt = u' [%s]'%dates
 
@@ -32,7 +33,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
         div0 = soup.find ('div', attrs={'class':'column left'})
         section_title = ''
         feeds = OrderedDict()
-        for div in div0.findAll('h4'):
+        for div in div0.findAll(['h4','h5']):
             articles = []
             section_title = self.tag_to_string(div.findPrevious('h3')).strip()
             title=self.tag_to_string(div.a).strip()
@@ -48,7 +49,7 @@ class BusinessWeekMagazine(BasicNewsRecipe):
             feeds[section_title] += articles
         div1 = soup.find ('div', attrs={'class':'column center'})
         section_title = ''
-        for div in div1.findAll('h5'):
+        for div in div1.findAll(['h4','h5']):
             articles = []
             desc=self.tag_to_string(div.findNext('p')).strip()
             section_title = self.tag_to_string(div.findPrevious('h3')).strip()
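Note (illustration, not part of the diff): findAll() accepts a list of tag names, which is what lets both columns now pick up sections headed by either h4 or h5. In the BeautifulSoup 3 dialect these recipes use:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<h4>Opening Remarks</h4><h5>Etc</h5><p>body</p>')
    print([tag.name for tag in soup.findAll(['h4', 'h5'])])  # ['h4', 'h5']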
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+
 class Ciekawostki_Historyczne(BasicNewsRecipe):
     title = u'Ciekawostki Historyczne'
     oldest_article = 7
@@ -7,42 +8,31 @@ class Ciekawostki_Historyczne(BasicNewsRecipe):
     description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.'
     category = 'history'
     language = 'pl'
-    masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
-    cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
+    masthead_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
+    cover_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
     max_articles_per_feed = 100
+    extra_css = 'img.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}'
+    oldest_article = 12
     preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
-    no_stylesheets=True
-    remove_empty_feeds=True
-    keep_only_tags=[dict(name='div', attrs={'class':'post'})]
-    remove_tags=[dict(id='singlepostinfo')]
+    no_stylesheets = True
+    remove_empty_feeds = True
+    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
+    recursions = 5
+    remove_tags = [dict(id='singlepostinfo')]
 
     feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')]
 
-    def append_page(self, soup, appendtag):
-        tag=soup.find(name='h7')
-        if tag:
-            if tag.br:
-                pass
-            elif tag.nextSibling.name=='p':
-                tag=tag.nextSibling
-            nexturl = tag.findAll('a')
-            for nextpage in nexturl:
-                tag.extract()
-                nextpage= nextpage['href']
-                soup2 = self.index_to_soup(nextpage)
-                pagetext = soup2.find(name='div', attrs={'class':'post'})
-                for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}):
-                    r.extract()
-                for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}):
-                    r.extract()
-                for r in pagetext.findAll('h1'):
-                    r.extract()
-                pagetext.find('h6').nextSibling.extract()
-                pagetext.find('h7').nextSibling.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)
+    def is_link_wanted(self, url, tag):
+        return 'ciekawostkihistoryczne' in url and url[-2] in {'2', '3', '4', '5', '6'}
 
-    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
+    def postprocess_html(self, soup, first_fetch):
+        tag = soup.find('h7')
+        if tag:
+            tag.nextSibling.extract()
+        if not first_fetch:
+            for r in soup.findAll(['h1']):
+                r.extract()
+            soup.find('h6').nextSibling.extract()
         return soup
 
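Note (not part of the diff): the rewrite above swaps a hand-rolled pager for calibre's built-in link recursion: recursions = 5 lets the downloader follow links up to five levels deep from each article, and is_link_wanted() keeps only this site's numbered continuation pages. The url[-2] test assumes addresses ending in a page digit plus one trailing character, e.g. '.../artykul/2/' (the URL below is hypothetical, for illustration):

    url = 'http://ciekawostkihistoryczne.pl/2013/03/przyklad/2/'
    print('ciekawostkihistoryczne' in url and url[-2] in {'2', '3', '4', '5', '6'})  # True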
@@ -1,5 +1,5 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class Computerworld_pl(BasicNewsRecipe):
     title = u'Computerworld.pl'
@@ -12,8 +12,16 @@ class Computerworld_pl(BasicNewsRecipe):
     no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 100
-    keep_only_tags = [dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
-    remove_tags_after = dict(name='div', attrs={'class':'rMobi'})
-    remove_tags = [dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
+    remove_attributes = ['style',]
+    preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''),]
+    keep_only_tags = [dict(id=['szpaltaL', 's2011'])]
+    remove_tags_after = dict(name='div', attrs={'class':'tresc'})
+    remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}),]
     feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
+
+    def skip_ad_pages(self, soup):
+        if soup.title.string.lower() == 'advertisement':
+            tag = soup.find(name='a')
+            if tag:
+                new_soup = self.index_to_soup(tag['href'], raw=True)
+                return new_soup
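Note (illustration, not part of the diff): skip_ad_pages() runs on each downloaded page before cleanup; returning raw markup (index_to_soup(..., raw=True)) makes calibre process the returned document in place of the interstitial, while returning None keeps the page as-is. The generic shape of such a hook, as a sketch:

    def skip_ad_pages(self, soup):
        # Detect an ad interstitial by its title and follow the first link
        # to the real article, returning its raw HTML; otherwise fall through.
        if soup.title and soup.title.string and 'advertisement' in soup.title.string.lower():
            link = soup.find('a')
            if link and link.get('href'):
                return self.index_to_soup(link['href'], raw=True)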
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment
+
 class CoNowegoPl(BasicNewsRecipe):
     title = u'conowego.pl'
     __author__ = 'fenuks'
@@ -10,6 +11,7 @@ class CoNowegoPl(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
     INDEX = 'http://www.conowego.pl/'
+    extra_css = '.news-single-img {float:left; margin-right:5px;}'
     no_stylesheets = True
     remove_empty_feeds = True
     use_embedded_content = False
@@ -35,6 +37,9 @@ class CoNowegoPl(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
         for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}):
             r.extract()
 
@@ -12,11 +12,13 @@ class CzasGentlemanow(BasicNewsRecipe):
     ignore_duplicate_articles = {'title', 'url'}
     oldest_article = 7
     max_articles_per_feed = 100
+    extra_css = '.gallery-item {float:left; margin-right: 10px; max-width: 20%;} .alignright {text-align: right; float:right; margin-left:5px;}\
+    .wp-caption-text {text-align: left;} img.aligncenter {display: block; margin-left: auto; margin-right: auto;} .alignleft {float: left; margin-right:5px;}'
     no_stylesheets = True
     remove_empty_feeds = True
     preprocess_regexps = [(re.compile(u'<h3>Może Cię też zainteresować:</h3>'), lambda m: '')]
     use_embedded_content = False
     keep_only_tags = [dict(name='div', attrs={'class':'content'})]
-    remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails'])]
+    remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails', 'respond'])]
     remove_tags_after = dict(id='comments')
     feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')]
@@ -16,6 +16,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
     extra_css = '.title {font-size:22px;}'
     oldest_article = 8
     max_articles_per_feed = 100
+    remove_attrs = ['style', 'width', 'height']
     preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
     keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
     remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')]
@@ -28,4 +29,11 @@ class Dobreprogramy_pl(BasicNewsRecipe):
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                 a['href']=self.index + a['href']
+        for r in soup.findAll('iframe'):
+            r.parent.extract()
+        return soup
+
+    def postprocess_html(self, soup, first_fetch):
+        for r in soup.findAll('span', text=''):
+            if not r.string:
+                r.extract()
         return soup
@@ -8,6 +8,7 @@ class BasicUserRecipe1337668045(BasicNewsRecipe):
     cover_url = 'http://drytooling.com.pl/images/drytooling-kindle.png'
     description = u'Drytooling.com.pl jest serwisem wspinaczki zimowej, alpinizmu i himalaizmu. Jeśli uwielbiasz zimę, nie możesz doczekać się aż wyciągniesz szpej z szafki i uderzysz w Tatry, Alpy, czy może Himalaje, to znajdziesz tutaj naprawdę dużo interesujących Cię treści! Zapraszamy!'
     __author__ = u'Damian Granowski'
+    language = 'pl'
     oldest_article = 100
     max_articles_per_feed = 20
     auto_cleanup = True
@@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class Dzieje(BasicNewsRecipe):
     title = u'dzieje.pl'
@@ -8,11 +9,12 @@ class Dzieje(BasicNewsRecipe):
     category = 'history'
     language = 'pl'
     ignore_duplicate_articles = {'title', 'url'}
+    extra_css = '.imagecache-default {float:left; margin-right:20px;}'
     index = 'http://dzieje.pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    remove_javascript=True
-    no_stylesheets= True
+    remove_javascript = True
+    no_stylesheets = True
     keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')]
     remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')]
     #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
@@ -28,16 +30,19 @@ class Dzieje(BasicNewsRecipe):
         pagetext = soup2.find(id='content-area').find(attrs={'class':'content'})
         for r in pagetext.findAll(attrs={'class':['fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult', 'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka', 'fieldgroup group-zdjeciedu', 'field field-type-filefield field-field-zdjecieglownawyd']}):
             r.extract()
-        pos = len(appendtag.contents)
-        appendtag.insert(pos, pagetext)
+        comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+        # appendtag.insert(pos, pagetext)
         tag = soup2.find('li', attrs={'class':'pager-next'})
         for r in appendtag.findAll(attrs={'class':['item-list', 'field field-type-computed field-field-tagi', ]}):
             r.extract()
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
 
     def find_articles(self, url):
         articles = []
-        soup=self.index_to_soup(url)
-        tag=soup.find(id='content-area').div.div
+        soup = self.index_to_soup(url)
+        tag = soup.find(id='content-area').div.div
         for i in tag.findAll('div', recursive=False):
             temp = i.find(attrs={'class':'views-field-title'}).span.a
             title = temp.string
@@ -64,7 +69,7 @@ class Dzieje(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         for a in soup('a'):
-            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
-                a['href']=self.index + a['href']
+            if a.has_key('href') and not a['href'].startswith('http'):
+                a['href'] = self.index + a['href']
         self.append_page(soup, soup.body)
         return soup
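Note (illustration, not part of the diff): here and in BenchmarkPl the relative-link test is tightened from substring checks to a prefix check. A relative href still gets the site prefix, while an absolute URL is now left alone even when 'http://' merely appears inside it, e.g. in a redirect parameter:

    index = 'http://dzieje.pl'
    for href in ('/wiadomosci/x', 'http://dzieje.pl/y', '/out?u=http://inne.pl'):
        # The old test ("'http://' not in href") wrongly skipped the third link.
        print(index + href if not href.startswith('http') else href)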
@ -2,6 +2,8 @@
|
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
 
 class Dziennik_pl(BasicNewsRecipe):
     title = u'Dziennik.pl'
     __author__ = 'fenuks'
@ -9,17 +11,17 @@ class Dziennik_pl(BasicNewsRecipe):
     category = 'newspaper'
     language = 'pl'
     masthead_url= 'http://5.s.dziennik.pl/images/logos.png'
-    cover_url= 'http://5.s.dziennik.pl/images/logos.png'
+    cover_url = 'http://5.s.dziennik.pl/images/logos.png'
     no_stylesheets = True
     oldest_article = 7
     max_articles_per_feed = 100
-    remove_javascript=True
-    remove_empty_feeds=True
+    remove_javascript = True
+    remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
-    extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
+    extra_css = 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
     preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
-    keep_only_tags=[dict(id='article')]
-    remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
+    keep_only_tags = [dict(id='article')]
+    remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
     feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
         (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
         (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
@ -34,26 +36,29 @@ class Dziennik_pl(BasicNewsRecipe):
         (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
 
     def skip_ad_pages(self, soup):
-        tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
+        tag = soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'})
         if tag:
-            new_soup=self.index_to_soup(tag['href'], raw=True)
+            new_soup = self.index_to_soup(tag['href'], raw=True)
             return new_soup
 
     def append_page(self, soup, appendtag):
-        tag=soup.find('a', attrs={'class':'page_next'})
+        tag = soup.find('a', attrs={'class':'page_next'})
         if tag:
             appendtag.find('div', attrs={'class':'article_paginator'}).extract()
             while tag:
-                soup2= self.index_to_soup(tag['href'])
-                tag=soup2.find('a', attrs={'class':'page_next'})
+                soup2 = self.index_to_soup(tag['href'])
+                tag = soup2.find('a', attrs={'class':'page_next'})
                 if not tag:
                     for r in appendtag.findAll('div', attrs={'class':'art_src'}):
                         r.extract()
                 pagetext = soup2.find(name='div', attrs={'class':'article_body'})
                 for dictionary in self.remove_tags:
-                    v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
+                    v = pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
                     for delete in v:
                         delete.extract()
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                for comment in comments:
+                    comment.extract()
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
         if appendtag.find('div', attrs={'class':'article_paginator'}):
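Note: this hunk, and most of the Polish-recipe hunks that follow, add one and the same cleanup: HTML comments are stripped from each fetched page before it is stitched into the ebook. A minimal standalone sketch of the idiom, using the BeautifulSoup 3 API bundled with calibre (the sample markup is invented):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment

    def strip_comments(soup):
        # findAll(text=...) visits every text node; the lambda keeps only
        # Comment nodes (<!-- ... -->), which extract() removes in place.
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup

    soup = BeautifulSoup('<div>kept<!-- tracking marker dropped --></div>')
    print strip_comments(soup)  # -> <div>kept</div>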
@ -1,5 +1,7 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
 class DziennikWschodni(BasicNewsRecipe):
     title = u'Dziennik Wschodni'
     __author__ = 'fenuks'
@ -73,6 +75,10 @@ class DziennikWschodni(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class EchoDnia(BasicNewsRecipe):
     title = u'Echo Dnia'
@ -69,6 +70,10 @@ class EchoDnia(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
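The @ -73,6 and @ -69,6 hunks above patch the append_page idiom these regional-paper recipes share. Stripped to its core, and with the CSS class names (page_next, article_body) as illustrative assumptions standing in for whatever a given site uses, the idiom is:

    def append_page(self, soup, appendtag):
        # Follow the "next page" link until it runs out, appending each
        # page's article body to the first page so the ebook carries the
        # whole article instead of page one only.
        tag = soup.find('a', attrs={'class': 'page_next'})
        while tag:
            soup2 = self.index_to_soup(tag['href'])
            tag = soup2.find('a', attrs={'class': 'page_next'})
            pagetext = soup2.find('div', attrs={'class': 'article_body'})
            appendtag.insert(len(appendtag.contents), pagetext)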
recipes/eclipseonline.recipe (new file, 38 lines)
@ -0,0 +1,38 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class EclipseOnline(BasicNewsRecipe):
+
+    #
+    # oldest_article specifies the maximum age, in days, of posts to retrieve.
+    # The default of 32 is intended to work well with a "days of month = 1"
+    # recipe schedule to download "monthly issues" of Eclipse Online.
+    # Increase this value to include additional posts. However, the RSS feed
+    # currently only includes the 10 most recent posts, so that's the max.
+    #
+    oldest_article = 32
+
+    title = u'Eclipse Online'
+    description = u'"Where strange and wonderful things happen, where reality is eclipsed for a little while with something magical and new." Eclipse Online is edited by Jonathan Strahan and published online by Night Shade Books. http://www.nightshadebooks.com/category/eclipse/'
+    publication_type = 'magazine'
+    language = 'en'
+
+    __author__ = u'Jim DeVona'
+    __version__ = '1.0'
+
+    # For now, use this Eclipse Online logo as the ebook cover image.
+    # (Disable the cover_url line to let Calibre generate a default cover, including date.)
+    cover_url = 'http://www.nightshadebooks.com/wp-content/uploads/2012/10/Eclipse-Logo.jpg'
+
+    # Extract the "post" div containing the story (minus redundant metadata) from each page.
+    keep_only_tags = [dict(name='div', attrs={'class':lambda x: x and 'post' in x})]
+    remove_tags = [dict(name='span', attrs={'class': ['post-author', 'post-category', 'small']})]
+
+    # Nice plain markup (like Eclipse's) works best for most e-readers.
+    # Disregard any special styling rules, but center illustrations.
+    auto_cleanup = False
+    no_stylesheets = True
+    remove_attributes = ['style', 'align']
+    extra_css = '.wp-caption {text-align: center;} .wp-caption-text {font-size: small; font-style: italic;}'
+
+    # Tell Calibre where to look for article links. It will proceed to retrieve
+    # these posts and format them into an ebook according to the above rules.
+    feeds = ['http://www.nightshadebooks.com/category/eclipse/feed/']
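A new recipe like this one can be exercised without scheduling it in the GUI: ebook-convert accepts a .recipe file directly, and --test restricts the run to a couple of articles per feed. This is calibre's standard recipe-development loop; the output file name here is arbitrary:

    ebook-convert eclipseonline.recipe output.epub --test -vv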
@ -9,7 +9,7 @@ class EkologiaPl(BasicNewsRecipe):
     language = 'pl'
     cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png'
     ignore_duplicate_articles = {'title', 'url'}
-    extra_css = '.title {font-size: 200%;}'
+    extra_css = '.title {font-size: 200%;} .imagePowiazane, .imgCon {float:left; margin-right:5px;}'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
@ -3,29 +3,37 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, matek09, matek09@gmail.com'
 
-from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment
 
 class Esensja(BasicNewsRecipe):
 
     title = u'Esensja'
-    __author__ = 'matek09'
-    description = 'Monthly magazine'
+    __author__ = 'matek09 & fenuks'
+    description = 'Magazyn kultury popularnej'
     encoding = 'utf-8'
     no_stylesheets = True
     language = 'pl'
     remove_javascript = True
+    masthead_url = 'http://esensja.pl/img/wrss.gif'
+    oldest_article = 1
+    URL = 'http://esensja.pl'
     HREF = '0'
+    remove_attributes = ['style', 'bgcolor', 'alt', 'color']
-    #keep_only_tags =[]
+    keep_only_tags = [dict(attrs={'class':'sekcja'}), ]
     #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'})
-    remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
-    remove_tags_after = dict(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+    #remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
+    remove_tags_after = dict(id='tekst')
 
-    remove_tags =[]
-    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
-    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))
+    remove_tags = [dict(name = 'img', attrs = {'src' : ['../../../2000/01/img/tab_top.gif', '../../../2000/01/img/tab_bot.gif']}),
+                   dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}),
+                   #dict(attrs={'rel':'lightbox[galeria]'})
+                   dict(attrs={'class':['tekst_koniec', 'ref', 'wykop']}),
+                   dict(attrs={'itemprop':['copyrightHolder', 'publisher']}),
+                   dict(id='komentarze')
+                   ]
 
     extra_css = '''
     .t-title {font-size: x-large; font-weight: bold; text-align: left}
@ -35,8 +43,9 @@ class Esensja(BasicNewsRecipe):
     .annot-ref {font-style: italic; text-align: left}
     '''
 
-    preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
-        lambda match: '')]
+    preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
+        (re.compile(ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
+        ]
 
     def parse_index(self):
         soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
@ -47,15 +56,19 @@ class Esensja(BasicNewsRecipe):
         soup = self.index_to_soup(self.HREF + '01.html')
         self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
         feeds = []
+        chapter = ''
+        subchapter = ''
+        articles = []
         intro = soup.find('div', attrs={'class' : 'n-title'})
+        '''
         introduction = {'title' : self.tag_to_string(intro.a),
                         'url' : self.HREF + intro.a['href'],
                         'date' : '',
                         'description' : ''}
         chapter = 'Wprowadzenie'
-        subchapter = ''
-        articles = []
         articles.append(introduction)
+        '''
+
         for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
             if tag.name in 'td':
                 if len(articles) > 0:
@ -71,17 +84,72 @@ class Esensja(BasicNewsRecipe):
                 subchapter = self.tag_to_string(tag)
                 subchapter = self.tag_to_string(tag)
                 continue
-            articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})
 
-            a = self.index_to_soup(self.HREF + tag.a['href'])
+            finalurl = tag.a['href']
+            if not finalurl.startswith('http'):
+                finalurl = self.HREF + finalurl
+            articles.append({'title' : self.tag_to_string(tag.a), 'url' : finalurl, 'date' : '', 'description' : ''})
+
+            a = self.index_to_soup(finalurl)
             i = 1
+
             while True:
                 div = a.find('div', attrs={'class' : 't-title2 nextpage'})
                 if div is not None:
-                    a = self.index_to_soup(self.HREF + div.a['href'])
-                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
+                    link = div.a['href']
+                    if not link.startswith('http'):
+                        link = self.HREF + link
+                    a = self.index_to_soup(link)
+                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : link, 'date' : '', 'description' : ''})
                     i = i + 1
                 else:
                     break
 
         return feeds
+
+    def append_page(self, soup, appendtag):
+        r = appendtag.find(attrs={'class':'wiecej_xxx'})
+        if r:
+            nr = r.findAll(attrs={'class':'tn-link'})[-1]
+            try:
+                nr = int(nr.a.string)
+            except:
+                return
+            baseurl = soup.find(attrs={'property':'og:url'})['content'] + '&strona={0}'
+            for number in range(2, nr+1):
+                soup2 = self.index_to_soup(baseurl.format(number))
+                pagetext = soup2.find(attrs={'class':'tresc'})
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+                for r in appendtag.findAll(attrs={'class':['wiecej_xxx', 'tekst_koniec']}):
+                    r.extract()
+                for r in appendtag.findAll('script'):
+                    r.extract()
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        for tag in soup.findAll(attrs={'class':'img_box_right'}):
+            temp = tag.find('img')
+            src = ''
+            if temp:
+                src = temp.get('src', '')
+            for r in tag.findAll('a', recursive=False):
+                r.extract()
+            info = tag.find(attrs={'class':'img_info'})
+            text = str(tag)
+            if not src:
+                src = re.search('src="[^"]*?"', text)
+                if src:
+                    src = src.group(0)
+                    src = src[5:].replace('//', '/')
+            if src:
+                tag.contents = []
+                tag.insert(0, BeautifulSoup('<img src="{0}{1}" />'.format(self.URL, src)))
+            if info:
+                tag.insert(len(tag.contents), info)
        return soup
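The append_page added to both Esensja recipes paginates by page number rather than by next-links: it reads the highest page index out of the pager widget, then fetches &strona=N for each N. Condensed, with the site's own class names (wiecej_xxx, tn-link, tresc) carried over from the hunk above:

    def append_numbered_pages(self, soup, appendtag):
        pager = appendtag.find(attrs={'class': 'wiecej_xxx'})
        if not pager:
            return
        try:
            # The last tn-link entry holds the highest page number.
            last = int(pager.findAll(attrs={'class': 'tn-link'})[-1].a.string)
        except (TypeError, ValueError, AttributeError, IndexError):
            return
        # og:url is the canonical article URL; '&strona=N' selects page N.
        baseurl = soup.find(attrs={'property': 'og:url'})['content'] + '&strona={0}'
        for number in range(2, last + 1):
            page = self.index_to_soup(baseurl.format(number))
            appendtag.insert(len(appendtag.contents), page.find(attrs={'class': 'tresc'}))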
recipes/esensja_(rss).recipe (new file, 109 lines)
@ -0,0 +1,109 @@
+__license__ = 'GPL v3'
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment
+
+class EsensjaRSS(BasicNewsRecipe):
+    title = u'Esensja (RSS)'
+    __author__ = 'fenuks'
+    description = u'Magazyn kultury popularnej'
+    category = 'reading, fantasy, reviews, boardgames, culture'
+    #publication_type = ''
+    language = 'pl'
+    encoding = 'utf-8'
+    INDEX = 'http://www.esensja.pl'
+    extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left}
+                   .t-author {font-size: x-small; text-align: left}
+                   .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+                   .text {font-size: small; text-align: left}
+                   .annot-ref {font-style: italic; text-align: left}
+                   '''
+    cover_url = ''
+    masthead_url = 'http://esensja.pl/img/wrss.gif'
+    use_embedded_content = False
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    ignore_duplicate_articles = {'title', 'url'}
+    preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''),
+                          (re.compile(ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
+                          ]
+    remove_attributes = ['style', 'bgcolor', 'alt', 'color']
+    keep_only_tags = [dict(attrs={'class':'sekcja'}), ]
+    remove_tags_after = dict(id='tekst')
+
+    remove_tags = [dict(name = 'img', attrs = {'src' : ['../../../2000/01/img/tab_top.gif', '../../../2000/01/img/tab_bot.gif']}),
+                   dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}),
+                   #dict(attrs={'rel':'lightbox[galeria]'})
+                   dict(attrs={'class':['tekst_koniec', 'ref', 'wykop']}),
+                   dict(attrs={'itemprop':['copyrightHolder', 'publisher']}),
+                   dict(id='komentarze')
+                   ]
+
+    feeds = [(u'Książka', u'http://esensja.pl/rss/ksiazka.rss'),
+             (u'Film', u'http://esensja.pl/rss/film.rss'),
+             (u'Komiks', u'http://esensja.pl/rss/komiks.rss'),
+             (u'Gry', u'http://esensja.pl/rss/gry.rss'),
+             (u'Muzyka', u'http://esensja.pl/rss/muzyka.rss'),
+             (u'Twórczość', u'http://esensja.pl/rss/tworczosc.rss'),
+             (u'Varia', u'http://esensja.pl/rss/varia.rss'),
+             (u'Zgryźliwi Tetrycy', u'http://esensja.pl/rss/tetrycy.rss'),
+             (u'Nowe książki', u'http://esensja.pl/rss/xnowosci.rss'),
+             (u'Ostatnio dodane książki', u'http://esensja.pl/rss/xdodane.rss'),
+             ]
+
+    def get_cover_url(self):
+        soup = self.index_to_soup(self.INDEX)
+        cover = soup.find(id='panel_1')
+        self.cover_url = self.INDEX + cover.find('a')['href'].replace('index.html', '') + 'img/ilustr/cover_b.jpg'
+        return getattr(self, 'cover_url', self.cover_url)
+
+    def append_page(self, soup, appendtag):
+        r = appendtag.find(attrs={'class':'wiecej_xxx'})
+        if r:
+            nr = r.findAll(attrs={'class':'tn-link'})[-1]
+            try:
+                nr = int(nr.a.string)
+            except:
+                return
+            baseurl = soup.find(attrs={'property':'og:url'})['content'] + '&strona={0}'
+            for number in range(2, nr+1):
+                soup2 = self.index_to_soup(baseurl.format(number))
+                pagetext = soup2.find(attrs={'class':'tresc'})
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+                for r in appendtag.findAll(attrs={'class':['wiecej_xxx', 'tekst_koniec']}):
+                    r.extract()
+                for r in appendtag.findAll('script'):
+                    r.extract()
+
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body)
+        for tag in soup.findAll(attrs={'class':'img_box_right'}):
+            temp = tag.find('img')
+            src = ''
+            if temp:
+                src = temp.get('src', '')
+            for r in tag.findAll('a', recursive=False):
+                r.extract()
+            info = tag.find(attrs={'class':'img_info'})
+            text = str(tag)
+            if not src:
+                src = re.search('src="[^"]*?"', text)
+                if src:
+                    src = src.group(0)
+                    src = src[5:].replace('//', '/')
+            if src:
+                tag.contents = []
+                tag.insert(0, BeautifulSoup('<img src="{0}{1}" />'.format(self.INDEX, src)))
+            if info:
+                tag.insert(len(tag.contents), info)
+        return soup
@ -1,5 +1,6 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 import re
 class FilmOrgPl(BasicNewsRecipe):
     title = u'Film.org.pl'
@ -7,13 +8,47 @@ class FilmOrgPl(BasicNewsRecipe):
     description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce."
     category = 'film'
     language = 'pl'
+    extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;} .recenzja-title {font-size: 150%; margin-top: 5px; margin-bottom: 5px;}'
     cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png'
     ignore_duplicate_articles = {'title', 'url'}
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
+    remove_javascript = True
     remove_empty_feeds = True
-    use_embedded_content = True
-    preprocess_regexps = [(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE|re.DOTALL), lambda m: '</body>'), (re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: '')]
-    remove_tags = [dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']})]
+    use_embedded_content = False
+    remove_attributes = ['style']
+    preprocess_regexps = [(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE|re.DOTALL), lambda m: '</body>'), (re.compile(ur'</?center>', re.IGNORECASE|re.DOTALL), lambda m: ''), (re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?){2,}', re.IGNORECASE|re.DOTALL), lambda m: '')]
+    keep_only_tags = [dict(name=['h11', 'h16', 'h17']), dict(attrs={'class':'editor'})]
+    remove_tags_after = dict(id='comments')
+    remove_tags = [dict(name=['link', 'meta', 'style']), dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']}), dict(id='comments'), dict(attrs={'style':'border: 0pt none ; margin: 0pt; padding: 0pt;'}), dict(name='p', attrs={'class':'rating'}), dict(attrs={'layout':'button_count'})]
     feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')]
+
+    def append_page(self, soup, appendtag):
+        tag = soup.find('div', attrs={'class': 'pagelink'})
+        if tag:
+            for nexturl in tag.findAll('a'):
+                url = nexturl['href']
+                soup2 = self.index_to_soup(url)
+                pagetext = soup2.find(attrs={'class': 'editor'})
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                for comment in comments:
+                    comment.extract()
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+            for r in appendtag.findAll(attrs={'class': 'pagelink'}):
+                r.extract()
+            for r in appendtag.findAll(attrs={'id': 'comments'}):
+                r.extract()
+            for r in appendtag.findAll(attrs={'style':'border: 0pt none ; margin: 0pt; padding: 0pt;'}):
+                r.extract()
+            for r in appendtag.findAll(attrs={'layout':'button_count'}):
+                r.extract()
+
+    def preprocess_html(self, soup):
+        for c in soup.findAll('h11'):
+            c.name = 'h1'
+        self.append_page(soup, soup.body)
+        for r in soup.findAll('br'):
+            r.extract()
+        return soup
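The new keep_only_tags above leans on the site's nonstandard h11/h16/h17 heading tags, which is why preprocess_html renames h11 back to a real h1 before output. The renaming trick in isolation, exactly as the hunk uses it:

    def preprocess_html(self, soup):
        # BeautifulSoup tags can be renamed in place by assigning .name;
        # film.org.pl serves article titles as <h11>, which readers ignore.
        for c in soup.findAll('h11'):
            c.name = 'h1'
        return soup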
@ -1,6 +1,7 @@
-from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
 class FilmWebPl(BasicNewsRecipe):
     title = u'FilmWeb'
     __author__ = 'fenuks'
@ -14,11 +15,12 @@ class FilmWebPl(BasicNewsRecipe):
     no_stylesheets = True
     remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
-    preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
+    remove_javascript = True
+    preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
     extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
-    remove_tags = [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
+    #remove_tags = [dict()]
     remove_attributes = ['style',]
-    keep_only_tags = [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
+    keep_only_tags = [dict(attrs={'class':['hdr hdr-super', 'newsContent']})]
     feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
             (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
             (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'),
@ -42,6 +44,11 @@ class FilmWebPl(BasicNewsRecipe):
         if skip_tag is not None:
             return self.index_to_soup(skip_tag['href'], raw=True)
 
+    def postprocess_html(self, soup, first_fetch):
+        for r in soup.findAll(attrs={'class':'singlephoto'}):
+            r['style'] = 'float:left; margin-right: 10px;'
+        return soup
+
     def preprocess_html(self, soup):
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
@ -51,9 +58,8 @@ class FilmWebPl(BasicNewsRecipe):
         for i in soup.findAll('sup'):
             if not i.string or i.string.startswith('(kliknij'):
                 i.extract()
-        tag = soup.find(name='ul', attrs={'class':'inline sep-line'})
-        if tag:
-            tag.name = 'div'
-            for t in tag.findAll('li'):
-                t.name = 'div'
+        for r in soup.findAll(id=re.compile('photo-\d+')):
+            r.extract()
+        for r in soup.findAll(style=re.compile('float: ?left')):
+            r['class'] = 'singlephoto'
         return soup
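The FilmWeb change splits image handling across the two hooks: preprocess_html tags float-styled photos with a singlephoto class, and postprocess_html gives exactly that class one known-safe inline style, presumably so the float survives the recipe's remove_attributes = ['style'] stripping in between. The two halves side by side (assumes import re at module level, as in the recipe):

    def preprocess_html(self, soup):
        # Mark photos the site floats left, before inline styles are stripped.
        for r in soup.findAll(style=re.compile('float: ?left')):
            r['class'] = 'singlephoto'
        return soup

    def postprocess_html(self, soup, first_fetch):
        # Re-apply a single controlled style to the marked photos.
        for r in soup.findAll(attrs={'class': 'singlephoto'}):
            r['style'] = 'float:left; margin-right: 10px;'
        return soup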
@ -1,6 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
-from calibre.ptempfile import PersistentTemporaryFile
 
 class ForeignAffairsRecipe(BasicNewsRecipe):
     ''' there are three modifications:
@ -45,7 +44,6 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         'publisher': publisher}
 
     temp_files = []
-    articles_are_obfuscated = True
 
     def get_cover_url(self):
         soup = self.index_to_soup(self.FRONTPAGE)
@ -53,20 +51,6 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
             img_url = div.find('img')['src']
         return self.INDEX + img_url
 
-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        br.open(url)
-
-        response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0)
-        html = response.read()
-
-        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-
-        return self.temp_files[-1].name
-
     def parse_index(self):
 
         answer = []
@ -89,10 +73,10 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
             if div.find('a') is not None:
                 originalauthor=self.tag_to_string(div.findNext('div', attrs = {'class':'views-field-field-article-book-nid'}).div.a)
                 title=subsectiontitle+': '+self.tag_to_string(div.span.a)+' by '+originalauthor
-                url=self.INDEX+div.span.a['href']
+                url=self.INDEX+self.index_to_soup(self.INDEX+div.span.a['href']).find('a', attrs={'class':'fa_addthis_print'})['href']
                 atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
                 if atr is not None:
-                    author=self.tag_to_string(atr.span.a)
+                    author=self.tag_to_string(atr.span)
                 else:
                     author=''
                 desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'})
@ -106,10 +90,10 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
             for div in sec.findAll('div', attrs = {'class': 'views-field-title'}):
                 if div.find('a') is not None:
                     title=self.tag_to_string(div.span.a)
-                    url=self.INDEX+div.span.a['href']
+                    url=self.INDEX+self.index_to_soup(self.INDEX+div.span.a['href']).find('a', attrs={'class':'fa_addthis_print'})['href']
                     atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
                     if atr is not None:
-                        author=self.tag_to_string(atr.span.a)
+                        author=self.tag_to_string(atr.span)
                     else:
                         author=''
                     desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'})
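Both url= changes above do the same job the deleted get_obfuscated_article used to: each article is resolved to its print view, now eagerly while the index is built, by following the fa_addthis_print anchor on the article page. As a hypothetical helper (the recipe inlines the expression instead):

    def print_page_url(self, article_href):
        # Fetch the article page and return the print view it links to.
        soup = self.index_to_soup(self.INDEX + article_href)
        return self.INDEX + soup.find('a', attrs={'class': 'fa_addthis_print'})['href']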
recipes/fortune_magazine.recipe (new file, 75 lines)
@ -0,0 +1,75 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from collections import OrderedDict
+
+class Fortune(BasicNewsRecipe):
+
+    title = 'Fortune Magazine'
+    __author__ = 'Rick Shang'
+
+    description = 'FORTUNE is a global business magazine that has been revered in its content and credibility since 1930. FORTUNE covers the entire field of business, including specific companies and business trends, prominent business leaders, and new ideas shaping the global marketplace.'
+    language = 'en'
+    category = 'news'
+    encoding = 'UTF-8'
+    keep_only_tags = [dict(attrs={'id':['storycontent']})]
+    remove_tags = [dict(attrs={'class':['hed_side','socialMediaToolbarContainer']})]
+    no_javascript = True
+    no_stylesheets = True
+    needs_subscription = True
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        br.open('http://money.cnn.com/2013/03/21/smallbusiness/legal-marijuana-startups.pr.fortune/index.html')
+        br.select_form(name="paywall-form")
+        br['email'] = self.username
+        br['password'] = self.password
+        br.submit()
+        return br
+
+    def parse_index(self):
+        articles = []
+        soup0 = self.index_to_soup('http://money.cnn.com/magazines/fortune/')
+
+        #Go to the latestissue
+        soup = self.index_to_soup(soup0.find('div',attrs={'class':'latestissue'}).find('a',href=True)['href'])
+
+        #Find cover & date
+        cover_item = soup.find('div', attrs={'id':'cover-story'})
+        cover = cover_item.find('img',src=True)
+        self.cover_url = cover['src']
+        date = self.tag_to_string(cover_item.find('div', attrs={'class':'tocDate'})).strip()
+        self.timefmt = u' [%s]'%date
+
+
+        feeds = OrderedDict()
+        section_title = ''
+
+        #checkout the cover story
+        articles = []
+        coverstory=soup.find('div', attrs={'class':'cnnHeadline'})
+        title=self.tag_to_string(coverstory.a).strip()
+        url=coverstory.a['href']
+        desc=self.tag_to_string(coverstory.findNext('p', attrs={'class':'cnnBlurbTxt'}))
+        articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
+        feeds['Cover Story'] = []
+        feeds['Cover Story'] += articles
+
+        for post in soup.findAll('div', attrs={'class':'cnnheader'}):
+            section_title = self.tag_to_string(post).strip()
+            articles = []
+
+            ul=post.findNext('ul')
+            for link in ul.findAll('li'):
+                links=link.find('h2')
+                title=self.tag_to_string(links.a).strip()
+                url=links.a['href']
+                desc=self.tag_to_string(link.find('p', attrs={'class':'cnnBlurbTxt'}))
+                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
+
+            if articles:
+                if section_title not in feeds:
+                    feeds[section_title] = []
+                feeds[section_title] += articles
+
+        ans = [(key, val) for key, val in feeds.iteritems()]
+        return ans
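feeds is an OrderedDict so sections reach the output in the order the magazine's contents page lists them; a plain dict on the Python 2 that calibre used here would scramble that order. A toy illustration (section names invented):

    from collections import OrderedDict

    feeds = OrderedDict()
    feeds['Cover Story'] = [{'title': 'A', 'url': 'http://example.com/a',
                             'description': '', 'date': ''}]
    feeds['Features'] = [{'title': 'B', 'url': 'http://example.com/b',
                          'description': '', 'date': ''}]
    # iteritems() yields keys in insertion order: Cover Story, then Features.
    print [(key, len(val)) for key, val in feeds.iteritems()]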
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GazetaLubuska(BasicNewsRecipe):
     title = u'Gazeta Lubuska'
@ -59,6 +60,10 @@ class GazetaLubuska(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GazetaPomorska(BasicNewsRecipe):
     title = u'Gazeta Pomorska'
@ -86,6 +87,10 @@ class GazetaPomorska(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GazetaWspolczesna(BasicNewsRecipe):
     title = u'Gazeta Wsp\xf3\u0142czesna'
@ -58,6 +59,10 @@ class GazetaWspolczesna(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class Gazeta_Wyborcza(BasicNewsRecipe):
     title = u'Gazeta.pl'
@ -16,6 +16,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
     max_articles_per_feed = 100
     remove_javascript = True
     no_stylesheets = True
+    ignore_duplicate_articles = {'title', 'url'}
     remove_tags_before = dict(id='k0')
     remove_tags_after = dict(id='banP4')
     remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})]
@ -48,6 +49,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
             url = self.INDEX + link['href']
             soup2 = self.index_to_soup(url)
             pagetext = soup2.find(id='artykul')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
             tag = soup2.find('div', attrs={'id': 'Str'})
@ -65,6 +69,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
             nexturl = pagetext.find(id='gal_btn_next')
             if nexturl:
                 nexturl = nexturl.a['href']
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
             rem = appendtag.find(id='gal_navi')
@ -105,3 +112,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
         soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href'])
         self.cover_url = 'http://wyborcza.pl' + soup.img['src']
         return getattr(self, 'cover_url', self.cover_url)
+
+    '''def image_url_processor(self, baseurl, url):
+        print "@@@@@@@@", url
+        return url.replace('http://wyborcza.pl/ ', '')'''
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GCN(BasicNewsRecipe):
     title = u'Gazeta Codziennej Nowiny'
@ -16,7 +17,7 @@ class GCN(BasicNewsRecipe):
     remove_empty_feeds = True
     no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
+    remove_attributes = ['style']
     preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
         (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
 
@ -78,6 +79,10 @@ class GCN(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
 
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -11,12 +11,13 @@ class Gildia(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    remove_empty_feeds=True
-    no_stylesheets=True
+    remove_empty_feeds = True
+    no_stylesheets = True
     ignore_duplicate_articles = {'title', 'url'}
     preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '') ]
-    remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
-    keep_only_tags=dict(name='div', attrs={'class':'widetext'})
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_tags = [dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
+    keep_only_tags = dict(name='div', attrs={'class':'widetext'})
     feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')]
 
 
@ -34,7 +35,7 @@ class Gildia(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
         for a in soup('a'):
-            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+            if a.has_key('href') and not a['href'].startswith('http'):
                 if '/gry/' in a['href']:
                     a['href']='http://www.gry.gildia.pl' + a['href']
                 elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
recipes/gofin_pl.recipe (new file, 26 lines)
@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = 'teepel <teepel44@gmail.com>'
+
+'''
+gofin.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class gofin(BasicNewsRecipe):
+    title = u'Gofin'
+    __author__ = 'teepel <teepel44@gmail.com>'
+    language = 'pl'
+    description =u'Portal Podatkowo-Księgowy'
+    INDEX='http://gofin.pl'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_empty_feeds= True
+    simultaneous_downloads = 5
+    remove_javascript=True
+    no_stylesheets=True
+    auto_cleanup = True
+
+    feeds = [(u'Podatki', u'http://www.rss.gofin.pl/podatki.xml'), (u'Prawo Pracy', u'http://www.rss.gofin.pl/prawopracy.xml'), (u'Rachunkowo\u015b\u0107', u'http://www.rss.gofin.pl/rachunkowosc.xml'), (u'Sk\u0142adki, zasi\u0142ki, emerytury', u'http://www.rss.gofin.pl/zasilki.xml'),(u'Firma', u'http://www.rss.gofin.pl/firma.xml'), (u'Prawnik radzi', u'http://www.rss.gofin.pl/prawnikradzi.xml')]
@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
 class Gram_pl(BasicNewsRecipe):
     title = u'Gram.pl'
     __author__ = 'fenuks'
@ -15,7 +16,7 @@ class Gram_pl(BasicNewsRecipe):
     #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
     cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
     keep_only_tags= [dict(id='articleModule')]
-    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']})]
+    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside')]
     feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
             (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')
             ]
@ -1,5 +1,6 @@
 import time
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
 
 class GryOnlinePl(BasicNewsRecipe):
     title = u'Gry-Online.pl'
@ -40,10 +41,14 @@ class GryOnlinePl(BasicNewsRecipe):
                     r.extract()
                 for r in pagetext.findAll(attrs={'itemprop':'description'}):
                     r.extract()
+
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
                 for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
                     r.extract()
+            comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
         else:
             tag = appendtag.find('div', attrs={'class':'S018stronyr'})
             if tag:
@ -70,10 +75,16 @@ class GryOnlinePl(BasicNewsRecipe):
                     r.extract()
                 for r in pagetext.findAll(attrs={'itemprop':'description'}):
                     r.extract()
+
+                comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+                [comment.extract() for comment in comments]
                 pos = len(appendtag.contents)
                 appendtag.insert(pos, pagetext)
                 for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}):
                     r.extract()
+            comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
 
     def image_url_processor(self, baseurl, url):
         if url.startswith('..'):
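This recipe ends up with both spellings of the comment sweep in one file; they are equivalent, the list comprehension merely builds and discards a list of the extracted nodes:

    comments = pagetext.findAll(text=lambda text: isinstance(text, Comment))
    # Loop form, as in the first hunk:
    for comment in comments:
        comment.extract()
    # Comprehension form, as in the second hunk (same effect):
    [comment.extract() for comment in comments]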
@ -77,10 +77,9 @@ class Harpers_full(BasicNewsRecipe):
         self.timefmt = u' [%s]'%date
 
         #get cover
-        coverurl='http://harpers.org/wp-content/themes/harpers/ajax_microfiche.php?img=harpers-'+re.split('harpers.org/',currentIssue_url)[1]+'gif/0001.gif'
-        soup2 = self.index_to_soup(coverurl)
-        self.cover_url = self.tag_to_string(soup2.find('img')['src'])
+        self.cover_url = soup1.find('div', attrs = {'class':'picture_hp'}).find('img', src=True)['src']
         self.log(self.cover_url)
 
         articles = []
         count = 0
         for item in soup1.findAll('div', attrs={'class':'articleData'}):
@ -2,7 +2,6 @@ from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 
-import time
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class TheHindu(BasicNewsRecipe):
@ -14,44 +13,42 @@ class TheHindu(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
 
-    keep_only_tags = [dict(id='content')]
-    remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
-                   dict(id=['email-section', 'right-column', 'printfooter', 'topover',
-                            'slidebox', 'th_footer'])]
+    auto_cleanup = True
 
     extra_css = '.photo-caption { font-size: smaller }'
 
-    def preprocess_raw_html(self, raw, url):
-        return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td','center']):
-            t.name = 'div'
-        return soup
-
     def parse_index(self):
-        today = time.strftime('%Y-%m-%d')
-        soup = self.index_to_soup(
-            'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
-        div = soup.find(id='left-column')
-        feeds = []
+        soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
+        div = soup.find('div', attrs={'id':'left-column'})
+        soup.find(id='subnav-tpbar').extract()
 
         current_section = None
         current_articles = []
-        for x in div.findAll(['h3', 'div']):
-            if current_section and x.get('class', '') == 'tpaper':
-                a = x.find('a', href=True)
-                if a is not None:
-                    title = self.tag_to_string(a)
-                    self.log('\tFound article:', title)
-                    current_articles.append({'url':a['href']+'?css=print',
-                        'title':title, 'date': '',
-                        'description':''})
-            if x.name == 'h3':
-                if current_section and current_articles:
-                    feeds.append((current_section, current_articles))
-                current_section = self.tag_to_string(x)
-                self.log('Found section:', current_section)
-                current_articles = []
+        feeds = []
+        for x in div.findAll(['a', 'span']):
+            if x.name == 'span' and x['class'] == 's-link':
+                # Section heading found
+                if current_articles and current_section:
+                    feeds.append((current_section, current_articles))
+                current_section = self.tag_to_string(x)
+                current_articles = []
+                self.log('\tFound section:', current_section)
+            elif x.name == 'a':
+                title = self.tag_to_string(x)
+                url = x.get('href', False)
+                if not url or not title:
+                    continue
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+        if current_articles and current_section:
+            feeds.append((current_section, current_articles))
+
         return feeds
@ -8,20 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
     category = 'history'
     language = 'pl'
     oldest_article = 8
+    extra_css = 'img {float: left; margin-right: 10px;} .alignleft {float: left; margin-right: 10px;}'
     remove_empty_feeds= True
     no_stylesheets = True
     use_embedded_content = True
     max_articles_per_feed = 100
     ignore_duplicate_articles = {'title', 'url'}

     feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'),
              (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'),
              (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'),
              (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'),
              (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'),
              (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),]
-
-    def print_version(self, url):
-        return url + '?tmpl=component&print=1&layout=default&page='
@ -1,6 +1,6 @@
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
 from collections import OrderedDict
+import re
+from calibre.web.feeds.news import BasicNewsRecipe

 class HistoryToday(BasicNewsRecipe):

@ -19,7 +19,6 @@ class HistoryToday(BasicNewsRecipe):
-
     needs_subscription = True

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:

@ -46,8 +45,9 @@ class HistoryToday(BasicNewsRecipe):

         #Go to issue
         soup = self.index_to_soup('http://www.historytoday.com/contents')
-        cover = soup.find('div',attrs={'id':'content-area'}).find('img')['src']
+        cover = soup.find('div',attrs={'id':'content-area'}).find('img', attrs={'src':re.compile('.*cover.*')})['src']
         self.cover_url=cover
+        self.log(self.cover_url)

         #Go to the main body

@ -84,4 +84,3 @@ class HistoryToday(BasicNewsRecipe):

     def cleanup(self):
         self.browser.open('http://www.historytoday.com/logout')
-
BIN  recipes/icons/bachormagazyn.png   (new file, 898 B)
BIN  recipes/icons/blog_biszopa.png    (new file, 755 B)
BIN  recipes/icons/esenja.png          (new file, 329 B)
BIN  recipes/icons/esensja_(rss).png   (new file, 329 B)
BIN  recipes/icons/gofin_pl.png        (new file, 618 B)
BIN  recipes/icons/histmag.png         (new file, 537 B)
BIN  (icon modified, file name lost in extraction: 806 B -> 869 B)
BIN  recipes/icons/kdefamily_pl.png    (new file, 857 B)
BIN  recipes/icons/km_blog.png         (new file, 532 B)
BIN  recipes/icons/ksiazka_pl.png      (new file, 1.3 KiB)
     recipes/icons/nowy_obywatel.png   (executable file -> normal file, 480 B)
BIN  recipes/icons/optyczne_pl.png     (new file, 697 B)
BIN  recipes/icons/sport_pl.png        (new file, 627 B)
BIN  recipes/icons/websecurity_pl.png  (new file, 863 B)
@ -1,5 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+from calibre.ebooks.BeautifulSoup import Comment
+
 class in4(BasicNewsRecipe):
     title = u'IN4.pl'
     oldest_article = 7

@ -8,14 +10,14 @@ class in4(BasicNewsRecipe):
     description = u'Serwis Informacyjny - Aktualnosci, recenzje'
     category = 'IT'
     language = 'pl'
-    index='http://www.in4.pl/'
+    index = 'http://www.in4.pl/'
     #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
     no_stylesheets = True
     remove_empty_feeds = True
     preprocess_regexps = [(re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '') ]
-    keep_only_tags=[dict(name='div', attrs={'class':'left_alone'})]
-    remove_tags_after=dict(name='img', attrs={'title':'komentarze'})
-    remove_tags=[dict(name='img', attrs={'title':'komentarze'})]
+    keep_only_tags = [dict(name='div', attrs={'class':'left_alone'})]
+    remove_tags_after = dict(name='img', attrs={'title':'komentarze'})
+    remove_tags = [dict(name='img', attrs={'title':'komentarze'})]
     feeds = [(u'Wiadomo\u015bci', u'http://www.in4.pl/rss.php'), (u'Recenzje', u'http://www.in4.pl/rss_recenzje.php'), (u'Mini recenzje', u'http://www.in4.pl/rss_mini.php')]

     def append_page(self, soup, appendtag):

@ -28,10 +30,13 @@ class in4(BasicNewsRecipe):
         while nexturl:
             soup2 = self.index_to_soup(nexturl)
             pagetext = soup2.find(id='news')
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
-            nexturl=None
-            tag=soup2.findAll('a')
+            nexturl = None
+            tag = soup2.findAll('a')
             for z in tag:
                 if z.string and u'następna str' in z.string:
                     nexturl='http://www.in4.pl/' + z['href']
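Note: the three comment-stripping lines added to append_page above are a pattern this merge also adds to several multi-page recipes further down (Kurier Galicyjski, Kurier Poranny, Linux Journal, NTO, OCLab, PC Foster, PurePC). A minimal standalone sketch of the idea, assuming calibre's bundled BeautifulSoup 3 and a hypothetical HTML snippet:

    # Sketch: strip <!-- ... --> nodes before appending a fetched page, so
    # stale ad markers and template debris do not leak into the ebook.
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment

    def strip_html_comments(tag):
        # findAll with a text= callable matches text nodes; Comment is a
        # NavigableString subclass, so only comment nodes are selected.
        for comment in tag.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()

    soup = BeautifulSoup('<div id="news">body<!-- ad marker --></div>')
    strip_html_comments(soup)
    print soup  # <div id="news">body</div>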
@ -6,16 +6,15 @@ class INFRA(BasicNewsRecipe):
     max_articles_per_feed = 100
     __author__ = 'fenuks'
     description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
-    cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
+    cover_url = 'http://i.imgur.com/j7hJT.jpg'
     category = 'UFO'
     index='http://infra.org.pl'
     language = 'pl'
     max_articles_per_feed = 100
-    no_stylesheers=True
-    remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
-    remove_tags_after=dict(attrs={'class':'pagenav'})
-    remove_tags=[dict(attrs={'class':'pagenav'})]
-    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
+    remove_attrs = ['style']
+    no_stylesheets = True
+    keep_only_tags = [dict(id='ja-current-content')]
+    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/rss')]

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
@ -1,23 +1,24 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
+__copyright__ = u'2010-2013, Tomasz Dlugosz <tomek3d@gmail.com>'
 '''
 fakty.interia.pl
 '''

+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class InteriaFakty(BasicNewsRecipe):
     title = u'Interia.pl - Fakty'
     description = u'Fakty ze strony interia.pl'
     language = 'pl'
-    oldest_article = 7
+    oldest_article = 1
     __author__ = u'Tomasz D\u0142ugosz'
-    simultaneous_downloads = 2
     no_stylesheets = True
     remove_javascript = True
-    max_articles_per_feed = 100
+    remove_empty_feeds= True
+    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}

     feeds = [(u'Kraj', u'http://kanaly.rss.interia.pl/kraj.xml'),
              (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'),

@ -26,14 +27,36 @@ class InteriaFakty(BasicNewsRecipe):
              (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'),
              (u'Ciekawostki', u'http://kanaly.rss.interia.pl/ciekawostki.xml')]

-    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
-    remove_tags = [
-        dict(name='div', attrs={'class':'box fontSizeSwitch'}),
-        dict(name='div', attrs={'class':'clear'}),
-        dict(name='div', attrs={'class':'embed embedLeft articleEmbedArticleList articleEmbedArticleListTitle'}),
-        dict(name='span', attrs={'class':'keywords'})]
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='div', attrs={'class': ['lead textContent', 'text textContent', 'source']})]
+
+    remove_tags = [dict(name='div', attrs={'class':['embed embedAd', 'REMOVE', 'boxHeader']})]
+
+    preprocess_regexps = [
+        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+        [
+            (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'),
+            (r'</div> <div class="source">', lambda match: ''),
+            (r'<p><a href="http://forum.interia.pl.*?</a></p>', lambda match: '')
+        ]
+    ]
+
+    def get_article_url(self, article):
+        link = article.get('link', None)
+        if link and 'galerie' not in link and link.split('/')[-1]=="story01.htm":
+            link=link.split('/')[-2]
+            encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+                '0D': '?', '0E': '-', '0H': ',', '0I': '_', '0N': '.com', '0L': 'http://'}
+            for k, v in encoding.iteritems():
+                link = link.replace(k, v)
+        return link
+
+    def print_version(self, url):
+        chunks = url.split(',')
+        return chunks[0] + '/podglad-wydruku'+ ',' + ','.join(chunks[1:])

     extra_css = '''
-        h2 { font-size: 1.2em; }
+        h1 { font-size:130% }
+        div.info { font-style:italic; font-size:70%}
     '''
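The get_article_url added above (the same helper is added to the Interia Sport recipe below) de-obfuscates feedsportal redirect links: the real address is packed into the second-to-last path segment with two-character escape codes, which the encoding map expands back. A standalone sketch, with a hypothetical encoded link constructed only for illustration:

    # Sketch of the feedsportal link decoding used by the Interia recipes.
    # The sample URL is hypothetical, built only to exercise the mapping.
    encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
                '0D': '?', '0E': '-', '0H': ',', '0I': '_', '0N': '.com', '0L': 'http://'}

    def decode_feedsportal(url):
        if url.split('/')[-1] == 'story01.htm':
            link = url.split('/')[-2]
            for k, v in encoding.iteritems():
                # dict order is arbitrary in Python 2; the recipes rely on
                # the codes not interfering with each other for real links
                link = link.replace(k, v)
            return link
        return url

    print decode_feedsportal(
        'http://rss.feedsportal.com/c/32739/0Lfakty0Binteria0Bpl0Cnews0H1/story01.htm')
    # 0L -> 'http://', 0B -> '.', 0C -> '/', 0H -> ',' gives:
    # http://fakty.interia.pl/news,1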
@ -1,7 +1,7 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
+__copyright__ = u'2010-2013, Tomasz Dlugosz <tomek3d@gmail.com>'
 '''
 sport.interia.pl
 '''

@ -13,61 +13,51 @@ class InteriaSport(BasicNewsRecipe):
     title = u'Interia.pl - Sport'
     description = u'Sport ze strony interia.pl'
     language = 'pl'
-    oldest_article = 7
+    oldest_article = 1
     __author__ = u'Tomasz D\u0142ugosz'
-    simultaneous_downloads = 3
     no_stylesheets = True
     remove_javascript = True
-    max_articles_per_feed = 100
+    remove_empty_feeds= True
+    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}

     feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'),
              (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'),
-             (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'),
              (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'),
-             (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'),
-             (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'),
-             (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'),
              (u'Tenis', u'http://kanaly.rss.interia.pl/tenis.xml')]

-    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
-
-    remove_tags = [dict(name='div', attrs={'class':'object gallery'}),
-        dict(name='div', attrs={'class':'box fontSizeSwitch'})]
-
-    extra_css = '''
-        .articleDate {
-            font-size: 0.5em;
-            color: black;
-        }
-
-        .articleFoto {
-            display: block;
-            font-family: sans;
-            font-size: 0.5em;
-            text-indent: 0
-            color: black;
-        }
-
-        .articleText {
-            display: block;
-            margin-bottom: 1em;
-            margin-left: 0;
-            margin-right: 0;
-            margin-top: 1em
-            color: black;
-        }
-
-        .articleLead {
-            font-size: 1.2em;
-        }
-    '''
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='div', attrs={'class': ['lead textContent', 'text textContent', 'source']})]
+
+    remove_tags = [dict(name='div', attrs={'class':['embed embedAd', 'REMOVE', 'boxHeader']})]

     preprocess_regexps = [
         (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
         [
             (r'<p><a href.*?</a></p>', lambda match: ''),
-            # FIXME
-            #(r'(<div id="newsAddContent">)(.*?)(<a href=".*">)(.*?)(</a>)', lambda match: '\1\2\4'),
-            (r'<p>(<i>)?<b>(ZOBACZ|CZYTAJ) T.*?</div>', lambda match: '</div>')
+            (r'<p>(<i>)?<b>(ZOBACZ|CZYTAJ) T.*?</div>', lambda match: '</div>'),
+            (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'),
+            (r'</div> <div class="source">', lambda match: ''),
+            (r'<p><a href="http://forum.interia.pl.*?</a></p>', lambda match: '')
         ]
     ]
+
+    def get_article_url(self, article):
+        link = article.get('link', None)
+        if link and 'galerie' not in link and link.split('/')[-1]=="story01.htm":
+            link=link.split('/')[-2]
+            encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+                '0D': '?', '0E': '-', '0H': ',', '0I': '_', '0N': '.com', '0L': 'http://'}
+            for k, v in encoding.iteritems():
+                link = link.replace(k, v)
+        return link
+
+    def print_version(self, url):
+        chunks = url.split(',')
+        return chunks[0] + '/podglad-wydruku'+ ',' + ','.join(chunks[1:])
+
+    extra_css = '''
+        h1 { font-size:130% }
+        div.info { font-style:italic; font-size:70%}
+    '''
@ -1,65 +1,62 @@
 __license__ = 'GPL v3'
-__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns"
+__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns, 2013 Tom Scholl"
 '''
 irishtimes.com
 '''
-import re
+import urlparse, re

 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ptempfile import PersistentTemporaryFile


 class IrishTimes(BasicNewsRecipe):
     title = u'The Irish Times'
-    encoding = 'ISO-8859-15'
-    __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns"
+    __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns, Tom Scholl"
     language = 'en_IE'
-    timefmt = ' (%A, %B %d, %Y)'

+    masthead_url = 'http://www.irishtimes.com/assets/images/generic/website/logo_theirishtimes.png'
+
+    encoding = 'utf-8'
     oldest_article = 1.0
     max_articles_per_feed = 100
+    remove_empty_feeds = True
     no_stylesheets = True
-    simultaneous_downloads= 5
-    r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*')
-    remove_tags = [dict(name='div', attrs={'class':'footer'})]
-    extra_css = 'p, div { margin: 0pt; border: 0pt; text-indent: 0.5em } .headline {font-size: large;} \n .fact { padding-top: 10pt }'
+    temp_files = []
+    articles_are_obfuscated = True

     feeds = [
-        ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
-        ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
-        ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
-        ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
-        ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
-        ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
-        ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
-        ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
-        ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
-        ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
-        ('Education & Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
-        ('Motors', 'http://www.irishtimes.com/feeds/rss/newspaper/motors.rss'),
-        ('An Teanga Bheo', 'http://www.irishtimes.com/feeds/rss/newspaper/anteangabheo.rss'),
-        ('Commercial Property', 'http://www.irishtimes.com/feeds/rss/newspaper/commercialproperty.rss'),
-        ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
-        ('Property', 'http://www.irishtimes.com/feeds/rss/newspaper/property.rss'),
-        ('The Tickets', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
-        ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
-        ('News features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
-        ('Obituaries', 'http://www.irishtimes.com/feeds/rss/newspaper/obituaries.rss'),
+        ('News', 'http://www.irishtimes.com/cmlink/the-irish-times-news-1.1319192'),
+        ('World', 'http://www.irishtimes.com/cmlink/irishtimesworldfeed-1.1321046'),
+        ('Politics', 'http://www.irishtimes.com/cmlink/irish-times-politics-rss-1.1315953'),
+        ('Business', 'http://www.irishtimes.com/cmlink/the-irish-times-business-1.1319195'),
+        ('Culture', 'http://www.irishtimes.com/cmlink/the-irish-times-culture-1.1319213'),
+        ('Sport', 'http://www.irishtimes.com/cmlink/the-irish-times-sport-1.1319194'),
+        ('Debate', 'http://www.irishtimes.com/cmlink/debate-1.1319211'),
+        ('Life & Style', 'http://www.irishtimes.com/cmlink/the-irish-times-life-style-1.1319214'),
     ]


-    def print_version(self, url):
-        if url.count('rss.feedsportal.com'):
-            #u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
-            u = url.find('irishtimes')
-            u = 'http://www.irishtimes.com' + url[u + 12:]
-            u = u.replace('0C', '/')
-            u = u.replace('A', '')
-            u = u.replace('0Bhtml/story01.htm', '_pf.html')
-        else:
-            u = url.replace('.html','_pf.html')
-        return u
-
-    def get_article_url(self, article):
-        return article.link
+    def get_obfuscated_article(self, url):
+        # Insert a pic from the original url, but use content from the print url
+        pic = None
+        pics = self.index_to_soup(url)
+        div = pics.find('div', {'class' : re.compile('image-carousel')})
+        if div:
+            pic = div.img
+        if pic:
+            try:
+                pic['src'] = urlparse.urljoin(url, pic['src'])
+                pic.extract()
+            except:
+                pic = None
+
+        content = self.index_to_soup(url + '?mode=print&ot=example.AjaxPageLayout.ot')
+        if pic:
+            content.p.insert(0, pic)
+
+        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+        self.temp_files[-1].write(content.prettify())
+        self.temp_files[-1].close()
+        return self.temp_files[-1].name
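The rewritten Irish Times recipe above drops print_version URL rewriting in favour of calibre's obfuscated-article hook: with articles_are_obfuscated set, get_obfuscated_article downloads the page itself and returns a path to a local file, which the rest of the pipeline then treats as the article. A minimal sketch of that contract, limited to what the diff above exercises; the recipe name is hypothetical:

    # Sketch of the articles_are_obfuscated / get_obfuscated_article contract:
    # fetch the page yourself, persist it, and hand back a local file path.
    from calibre.web.feeds.news import BasicNewsRecipe
    from calibre.ptempfile import PersistentTemporaryFile

    class ObfuscatedSketch(BasicNewsRecipe):   # hypothetical recipe
        title = 'Obfuscated article sketch'
        articles_are_obfuscated = True
        temp_files = []

        def get_obfuscated_article(self, url):
            raw = self.browser.open(url).read()  # any custom fetching goes here
            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write(raw)
            self.temp_files[-1].close()
            return self.temp_files[-1].name      # calibre parses this file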
@ -11,12 +11,10 @@ class AdvancedUserRecipe1295262156(BasicNewsRecipe):
     auto_cleanup = True
     encoding='iso-8859-1'
-
-
     feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')]

     def print_version(self, url):
-        return url+"&print=yes"
+        return url+"/print/yes"

     extra_css = 'td.textb {font-size: medium;}'
@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class KDEFamilyPl(BasicNewsRecipe):

@ -9,6 +10,7 @@ class KDEFamilyPl(BasicNewsRecipe):
     cover_url = 'http://www.mykde.home.pl/kdefamily/wp-content/uploads/2012/07/logotype-e1341585198616.jpg'
     oldest_article = 7
     max_articles_per_feed = 100
+    preprocess_regexps = [(re.compile(r"Podobne wpisy.*", re.IGNORECASE|re.DOTALL), lambda m: '')]
     no_stylesheets = True
     use_embedded_content = True
     feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')]
recipes/km_blog.recipe (new file, 36 lines)
@ -0,0 +1,36 @@
+
+__license__ = 'GPL v3'
+__author__ = 'teepel <teepel44@gmail.com>, Artur Stachecki <artur.stachecki@gmail.com>'
+
+'''
+korwin-mikke.pl/blog
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class km_blog(BasicNewsRecipe):
+    title = u'Korwin-Mikke Blog'
+    __author__ = 'teepel <teepel44@gmail.com>'
+    language = 'pl'
+    description ='Wiadomości z bloga korwin-mikke.pl/blog'
+    INDEX='http://korwin-mikke.pl/blog'
+    remove_empty_feeds= True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_javascript=True
+    no_stylesheets=True
+    remove_empty_feeds = True
+
+    feeds = [(u'blog', u'http://korwin-mikke.pl/blog/rss')]
+
+    keep_only_tags =[]
+    #this line should show title of the article, but it doesnt work
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'posts view'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'text'}))
+    keep_only_tags.append(dict(name = 'h1'))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'p', attrs = {'class' : 'float_right'}))
+    remove_tags.append(dict(name = 'p', attrs = {'class' : 'date'}))
+
+    remove_tags_after=[(dict(name = 'div', attrs = {'class': 'text'}))]
@ -25,6 +25,7 @@ class Konflikty(BasicNewsRecipe):
         for item in soup.findAll(style=True):
             del item['style']
         for image in soup.findAll(name='a', attrs={'class':'image'}):
+            image['style'] = 'width: 210px; float: left; margin-right:5px;'
             if image.img and image.img.has_key('alt'):
                 image.name='div'
                 pos = len(image.contents)
@ -8,16 +8,22 @@ class Kosmonauta(BasicNewsRecipe):
     category = 'astronomy'
     language = 'pl'
     cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
+    extra_css = '.thumbnail {float:left;margin-right:5px;}'
     no_stylesheets = True
     INDEX = 'http://www.kosmonauta.net'
     oldest_article = 7
     no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['style']
     max_articles_per_feed = 100
     keep_only_tags = [dict(name='div', attrs={'class':'item-page'})]
-    remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})]
+    remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']}), dict(attrs={'alt':['Poprzednia strona', 'Następna strona']})]
     remove_tags_after = dict(name='div', attrs={'class':'cedtag'})
     feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')]

+    def print_version(self, url):
+        return url + '?tmpl=component&print=1&layout=default&page='
+
     def preprocess_html(self, soup):
         for a in soup.findAll(name='a'):
             if a.has_key('href'):

@ -25,4 +31,3 @@ class Kosmonauta(BasicNewsRecipe):
                 if not href.startswith('http'):
                     a['href'] = self.INDEX + href
         return soup
-
@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs
+from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs, Comment

 class KurierGalicyjski(BasicNewsRecipe):
     title = u'Kurier Galicyjski'
     __author__ = 'fenuks'

@ -42,6 +43,9 @@ class KurierGalicyjski(BasicNewsRecipe):
             r.extract()
         for r in appendtag.findAll(attrs={'style':'border-top-width: thin; border-top-style: dashed; border-top-color: #CCC; border-bottom-width: thin; border-bottom-style: dashed; border-bottom-color: #CCC; padding-top:5px; padding-bottom:5px; text-align:right; margin-top:10px; height:20px;'}):
             r.extract()
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()

     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment

 class KurierPoranny(BasicNewsRecipe):
     title = u'Kurier Poranny'

@ -73,6 +74,11 @@ class KurierPoranny(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)

+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -2,7 +2,7 @@ __license__ = 'GPL v3'
 __author__ = 'Lorenzo Vigentini and Olivier Daigle'
 __copyright__ = '2012, Lorenzo Vigentini <l.vigentini at gmail.com>, Olivier Daigle <odaigle _at nuvucameras __dot__ com>'
 __version__ = 'v1.01'
-__date__ = '22, December 2012'
+__date__ = '17, March 2013'
 __description__ = 'Canadian Paper '

 '''

@ -28,10 +28,14 @@ class ledevoir(BasicNewsRecipe):

     oldest_article = 1
     max_articles_per_feed = 200
+    min_articles_per_feed = 0
     use_embedded_content = False
     recursion = 10
     needs_subscription = 'optional'

+    compress_news_images = True
+    compress_news_images_auto_size = 4
+
     filterDuplicates = False
     url_list = []

@ -66,16 +70,16 @@ class ledevoir(BasicNewsRecipe):

     feeds = [
         (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
-        # (u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
-        # (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
-        # (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
-        # (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
-        # (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'),
-        # (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'),
-        # (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'),
-        # (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
-        # (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
-        # (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
+        (u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
+        (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
+        (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
+        (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
+        (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'),
+        (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'),
+        (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'),
+        (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
+        (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
+        (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
         (u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50')
     ]

@ -113,3 +117,23 @@ class ledevoir(BasicNewsRecipe):
         self.url_list.append(url)
         return url
+
+    '''
+    def postprocess_html(self, soup, first):
+        #process all the images. assumes that the new html has the correct path
+        if first == 0:
+            return soup
+
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            # width, height = img.size
+            # print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            if img < 0:
+                raise RuntimeError('Out of memory')
+            img.set_compression_quality(30)
+            img.save(iurl)
+        return soup
+    '''
@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment

 class LinuxJournal(BasicNewsRecipe):
     title = u'Linux Journal'

@ -25,6 +26,9 @@ class LinuxJournal(BasicNewsRecipe):
             soup2 = self.index_to_soup('http://www.linuxjournal.com'+ nexturl)
             pagetext = soup2.find(attrs={'class':'node-inner'}).find(attrs={'class':'content'})
             next = appendtag.find('li', attrs={'class':'pager-next'})
+            comments = pagetext.findAll(text=lambda text:isinstance(text, Comment))
+            for comment in comments:
+                comment.extract()
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)
             tag = appendtag.find('div', attrs={'class':'links'})
@ -1,13 +0,0 @@
-from calibre.web.feeds.news import CalibrePeriodical
-
-class MiDDay(CalibrePeriodical):
-
-    title = 'MiDDay'
-    calibre_periodicals_slug = 'midday'
-
-    description = '''Get your dose of the latest news, views and fun - from the
-    world of politics, sports and Bollywood to the cartoons, comics and games of
-    the entertainment section - India’s leading tabloid has it all. To subscribe
-    visit <a href="http://news.calibre-ebook.com/periodical/midday">calibre
-    Periodicals</a>.'''
-    language = 'en_IN'
@ -9,6 +9,7 @@ class Mlody_technik(BasicNewsRecipe):
     language = 'pl'
     #cover_url = 'http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
     no_stylesheets = True
+    extra_css = 'img.alignleft {float: left; margin-right: 5px;}'
     preprocess_regexps = [(re.compile(r"<h4>Podobne</h4>", re.IGNORECASE), lambda m: '')]
     oldest_article = 7
     max_articles_per_feed = 100
@ -11,6 +11,8 @@ class NaukawPolsce(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     remove_empty_feeds = True
+    extra_css = '.miniaturka {float: left; margin-right: 5px; max-width: 350px;} .miniaturka-dol-strony {display: inline-block; margin: 0 15px; width: 120px;}'
+    ignore_duplicate_articles = {'title', 'url'}
     index = 'http://www.naukawpolsce.pl'
     keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})]
     remove_tags = [dict(name='div', attrs={'class':'tagi'})]
@ -7,8 +7,11 @@ class Niebezpiecznik_pl(BasicNewsRecipe):
     category = 'hacking, IT'
     language = 'pl'
     oldest_article = 8
+    extra_css = '.entry {margin-top: 25px;}'
+    remove_attrs = ['style']
     max_articles_per_feed = 100
     no_stylesheets = True
+    remove_empty_feeds = True
     cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png'
     remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})]
     keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})]
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment

 class NTO(BasicNewsRecipe):
     title = u'Nowa Trybuna Opolska'

@ -58,6 +59,10 @@ class NTO(BasicNewsRecipe):
             pos = len(appendtag.contents)
             appendtag.insert(pos, pagetext)

+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
@ -35,7 +35,10 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
                 continue
             if x['class'] in {'story', 'ledeStory'}:
                 tt = 'h3' if x['class'] == 'story' else 'h1'
-                a = x.find(tt).find('a', href=True)
+                try:
+                    a = x.find(tt).find('a', href=True)
+                except AttributeError:
+                    continue
                 title = self.tag_to_string(a)
                 url = a['href'] + '&pagewanted=all'
                 self.log('\tFound article:', title, url)
@ -1,4 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
 class OCLab(BasicNewsRecipe):
     title = u'OCLab.pl'
     oldest_article = 7

@ -26,6 +28,10 @@ class OCLab(BasicNewsRecipe):
         appendtag.insert(pos, pagetext)
         for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}):
             r.extract()
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()
+
     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
         return soup
recipes/optyczne_pl.recipe (new file, 41 lines)
@ -0,0 +1,41 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class OptyczneRecipe(BasicNewsRecipe):
+    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
+    language = 'pl'
+
+    title = u'optyczne.pl'
+    category = u'News'
+    description = u'Najlepsze testy obiektywów, testy aparatów cyfrowych i testy lornetek w sieci!'
+    cover_url=''
+    remove_empty_feeds= True
+    no_stylesheets=True
+    oldest_article = 7
+    max_articles_per_feed = 100000
+    recursions = 0
+
+    no_stylesheets = True
+    remove_javascript = True
+
+    keep_only_tags =[]
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news'}))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'center'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'news_foto'}))
+    remove_tags.append(dict(name = 'div', attrs = {'align' : 'right'}))
+
+    extra_css = '''
+        body {font-family: Arial,Helvetica,sans-serif;}
+        h1{text-align: left;}
+        h2{font-size: medium; font-weight: bold;}
+        p.lead {font-weight: bold; text-align: left;}
+        .authordate {font-size: small; color: #696969;}
+        .fot{font-size: x-small; color: #666666;}
+    '''
+    feeds = [
+        ('Aktualnosci', 'http://www.optyczne.pl/rss.xml'),
+    ]
@ -6,6 +6,7 @@ class OSWorld(BasicNewsRecipe):
     category = 'OS, IT, open source, Linux'
     language = 'pl'
     cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png'
+    extra_css = 'img.alignleft {float: left; margin-right: 5px;}'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+
 class Overclock_pl(BasicNewsRecipe):
     title = u'Overclock.pl'
     oldest_article = 7
@ -11,6 +11,7 @@ class PC_Centre(BasicNewsRecipe):
     cover_url= 'http://pccentre.pl/views/images/logo.gif'
     no_stylesheets = True
     remove_empty_feeds = True
+    ignore_duplicate_articles = {'title', 'url'}
     #keep_only_tags= [dict(id='content')]
     #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
     remove_tags=[dict(attrs={'class':'logo_print'})]
@ -1,4 +1,8 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
+#currently recipe is not working
+
 class PC_Foster(BasicNewsRecipe):
     title = u'PC Foster'
     oldest_article = 7

@ -29,6 +33,9 @@ class PC_Foster(BasicNewsRecipe):
         appendtag.insert(pos, pagetext)
         for r in appendtag.findAll(attrs={'class':'review_content double'}):
             r.extract()
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()

     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
@ -67,6 +67,7 @@ class PsychologyToday(BasicNewsRecipe):
                 title = title + u' (%s)'%author
             article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href'])
             print_page=article_page.find('li', attrs={'class':'print_html first'})
+            if print_page is not None:
                 url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href']
                 desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip()
                 self.log('Found article:', title)
@ -23,8 +23,8 @@ class PublicoPT(BasicNewsRecipe):
     remove_empty_feeds = True
     extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '

-    keep_only_tags = [dict(attrs={'class':['content-noticia-title','artigoHeader','ECOSFERA_MANCHETE','noticia','textoPrincipal','ECOSFERA_texto_01']})]
-    remove_tags = [dict(attrs={'class':['options','subcoluna']})]
+    keep_only_tags = [dict(attrs={'class':['hentry article single']})]
+    remove_tags = [dict(attrs={'class':['entry-options entry-options-above group','entry-options entry-options-below group', 'module tag-list']})]

     feeds = [
         (u'Geral', u'http://feeds.feedburner.com/publicoRSS'),
@ -1,4 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
 class PurePC(BasicNewsRecipe):
     title = u'PurePC'
     oldest_article = 7

@ -27,6 +29,9 @@ class PurePC(BasicNewsRecipe):
         appendtag.insert(pos, pagetext)
         for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}):
             r.extract()
+        comments = appendtag.findAll(text=lambda text:isinstance(text, Comment))
+        for comment in comments:
+            comment.extract()

     def preprocess_html(self, soup):
         self.append_page(soup, soup.body)
@ -6,10 +6,12 @@ class RTE(BasicNewsRecipe):
     max_articles_per_feed = 100
     __author__ = u'Robin Phillips'
     language = 'en_IE'
+    auto_cleanup=True
+    auto_cleanup_keep = '//figure[@class="photography gal642 single"]'

     remove_tags = [dict(attrs={'class':['topAd','botad','previousNextItem','headline','footerLinks','footernav']})]

     feeds = [(u'News', u'http://www.rte.ie/rss/news.xml'), (u'Sport', u'http://www.rte.ie/rss/sport.xml'), (u'Soccer', u'http://www.rte.ie/rss/soccer.xml'), (u'GAA', u'http://www.rte.ie/rss/gaa.xml'), (u'Rugby', u'http://www.rte.ie/rss/rugby.xml'), (u'Racing', u'http://www.rte.ie/rss/racing.xml'), (u'Business', u'http://www.rte.ie/rss/business.xml'), (u'Entertainment', u'http://www.rte.ie/rss/entertainment.xml')]

-    def print_version(self, url):
-        return url.replace('http://www', 'http://m')
+    #def print_version(self, url):
+    #    return url.replace('http://www', 'http://m')
recipes/sport_pl.recipe (new file, 71 lines)
@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = 'teepel 2012'
+
+'''
+sport.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class sport_pl(BasicNewsRecipe):
+    title = 'Sport.pl'
+    __author__ = 'teepel <teepel44@gmail.com>'
+    language = 'pl'
+    description =u'Największy portal sportowy w Polsce. Wiadomości sportowe z najważniejszych wydarzeń, relacje i wyniki meczów na żywo.'
+    masthead_url='http://press.gazeta.pl/file/mediakit/154509/c8/sportpl.jpg'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    remove_javascript=True
+    no_stylesheets=True
+    remove_empty_feeds = True
+
+    keep_only_tags =[]
+    keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'}))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'a', attrs = {'href' : 'www.gazeta.pl'}))
+
+    feeds = [
+        (u'Wszystkie wiadomości', u'http://rss.gazeta.pl/pub/rss/sport.xml'),
+        (u'Piłka nożna', u'http://www.sport.pl/pub/rss/sport/pilka_nozna.htm'),
+        (u'F1', u'http://www.sport.pl/pub/rss/sportf1.htm'),
+        (u'Tenis', u'http://serwisy.gazeta.pl/pub/rss/tenis.htm'),
+        (u'Siatkówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611628/index.rss'),
+        (u'Koszykówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611647/index.rss'),
+        (u'Piłka ręczna', u'http://gazeta.pl.feedsportal.com/c/32739/f/611635/index.rss'),
+        (u'Inne sporty', u'http://gazeta.pl.feedsportal.com/c/32739/f/611649/index.rss'),
+    ]
+    def parse_feeds(self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                if '[ZDJĘCIA]' in article.title:
+                    article.title = article.title.replace('[ZDJĘCIA]','')
+                elif '[WIDEO]' in article.title:
+                    article.title = article.title.replace('[WIDEO]','')
+        return feeds
+
+    def print_version(self, url):
+        if 'feedsportal' in url:
+            segment = url.split('/')
+            urlPart = segment[-2]
+            urlPart = urlPart.replace('0L0Ssport0Bpl0C','')
+            urlPart = urlPart.replace('0C10H','/')
+            urlPart = urlPart.replace('0H',',')
+            urlPart = urlPart.replace('0I','_')
+            urlPart = urlPart.replace('A','')
+            segment1 = urlPart.split('/')
+            seg1 = segment1[0]
+            seg2 = segment1[1]
+            segment2 = seg2.split(',')
+            part = segment2[0] + ',' + segment2[1]
+            return 'http://www.sport.pl/' + seg1 + '/2029020,' + part + '.html'
+        else:
+            segment = url.split('/')
+            part2 = segment[-2]
+            part1 = segment[-1]
+            segment2 = part1.split(',')
+            part = segment2[1] + ',' + segment2[2]
+            return 'http://www.sport.pl/' + part2 + '/2029020,' + part + '.html'
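The print_version in the new sport.pl recipe above rebuilds a print-friendly URL from either a feedsportal-packed link or a direct article link. A worked trace of the direct-link branch, using a hypothetical URL constructed only to show the string arithmetic:

    # Worked example (hypothetical URL) of the non-feedsportal branch above.
    url = 'http://www.sport.pl/pilka/1,65037,13584888,Some_title.html'
    segment = url.split('/')
    part2 = segment[-2]                       # 'pilka'
    part1 = segment[-1]                       # '1,65037,13584888,Some_title.html'
    segment2 = part1.split(',')
    part = segment2[1] + ',' + segment2[2]    # '65037,13584888'
    print 'http://www.sport.pl/' + part2 + '/2029020,' + part + '.html'
    # http://www.sport.pl/pilka/2029020,65037,13584888.html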
@@ -8,11 +8,13 @@ class Tablety_pl(BasicNewsRecipe):
     cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
     category = 'IT'
     language = 'pl'
-    use_embedded_content=True
+    use_embedded_content = False
+    no_stylesheets = True
     oldest_article = 8
     max_articles_per_feed = 100
     preprocess_regexps = [(re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><strong>Przeczytaj koniecznie.*?</a></strong></p>', re.DOTALL), lambda match: '')]
+    keep_only_tags = [dict(id='news_block')]
     #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
     #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'})
-    #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})]
+    remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments']})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]
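For context, the first preprocess_regexps entry above deletes the "Przeczytaj także" ("Read also") cross-link block before the page is parsed. A standalone sketch, with the HTML snippet invented for illustration:

# -*- coding: utf-8 -*-
import re

# Invented sample; real tablety.pl pages embed a similar block after the post body.
html = (u'<p>Treść wpisu.</p><p><strong>Przeczytaj także: '
        u'<a href="http://www.tablety.pl/x">inny wpis</a></strong></p>')
pattern = re.compile(ur'<p><strong>Przeczytaj także.*?</a></strong></p>',
                     re.DOTALL)
print pattern.sub(u'', html)
# -> <p>Treść wpisu.</p>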
recipes/trystero.recipe (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2013, Tomasz Dlugosz <tomek3d@gmail.com>'

'''
trystero.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe


class trystero(BasicNewsRecipe):
    title = 'Trystero'
    __author__ = u'Tomasz D\u0142ugosz'
    language = 'pl'
    description = u'Trystero.pl jest niezależnym blogiem finansowym. Publikowane na nim teksty dotyczą rynku kapitałowego, ekonomii, gospodarki i życia społecznego – w takiej mniej więcej kolejności.'
    oldest_article = 7
    remove_javascript = True
    no_stylesheets = True

    feeds = [(u'Newsy', u'http://www.trystero.pl/feed')]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': ['post-content']})]
@@ -1,5 +1,6 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+
 class UbuntuPomoc(BasicNewsRecipe):
     title = u'Ubuntu-pomoc.org'
     __author__ = 'fenuks'
@@ -15,8 +16,8 @@ class UbuntuPomoc(BasicNewsRecipe):
     remove_empty_feeds = True
     use_embedded_content = False
     remove_attrs = ['style']
-    keep_only_tags = [dict(attrs={'class':'post'})]
-    remove_tags_after = dict(attrs={'class':'underEntry'})
-    remove_tags = [dict(attrs={'class':['underPostTitle', 'yarpp-related', 'underEntry', 'social', 'tags', 'commentlist', 'youtube_sc']}), dict(id=['wp_rp_first', 'commentReply'])]
+    keep_only_tags = [dict(name='article')]
+    #remove_tags_after = dict(attrs={'class':'underEntry'})
+    remove_tags = [dict(attrs={'class':['yarpp-related', 'youtube_sc', 'share']}), dict(name='footer')]
     feeds = [(u'Ca\u0142o\u015b\u0107', u'http://feeds.feedburner.com/Ubuntu-Pomoc'),
-             (u'Gry', u'http://feeds.feedburner.com/GryUbuntu-pomoc')]
+             ]
recipes/websecurity_pl.recipe (new file, 28 lines)
@@ -0,0 +1,28 @@
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


class WebSecurity(BasicNewsRecipe):
    title = u'WebSecurity'
    __author__ = 'fenuks'
    description = u'WebSecurity.pl to największy w Polsce portal o bezpieczeństwie sieciowym.'
    category = ''
    #publication_type = ''
    language = 'pl'
    #encoding = ''
    #extra_css = ''
    cover_url = 'http://websecurity.pl/images/websecurity-logo.png'
    masthead_url = ''
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class':'article single'}), dict(id='content')]
    remove_tags = [dict(attrs={'class':['sociable', 'no-comments']})]
    remove_tags_after = dict(attrs={'class':'sociable'})

    feeds = [(u'Wszystkie', u'http://websecurity.pl/feed/'), (u'Aktualno\u015bci', u'http://websecurity.pl/aktualnosci/feed/'), (u'Artyku\u0142y', u'http://websecurity.pl/artykuly/feed/'), (u'Blogosfera', u'http://websecurity.pl/blogosfera/wpisy/feed/')]
@@ -8,9 +8,11 @@ class WirtualneMedia(BasicNewsRecipe):
     use_embedded_content = False
     remove_empty_feeds = True
     __author__ = 'fenuks'
+    extra_css = '.thumbnail {float:left; max-width:150px; margin-right:5px;}'
     description = u'Portal o mediach, reklamie, internecie, PR, telekomunikacji - nr 1 w Polsce - WirtualneMedia.pl - wiadomości z pierwszej ręki.'
     category = 'internet'
     language = 'pl'
+    ignore_duplicate_articles = {'title', 'url'}
     masthead_url= 'http://i.wp.pl/a/f/jpeg/8654/wirtualnemedia.jpeg'
     cover_url= 'http://static.wirtualnemedia.pl/img/logo_wirtualnemedia_newsletter.gif'
     remove_tags=[dict(id=['header', 'footer'])]
@@ -23,8 +25,6 @@ class WirtualneMedia(BasicNewsRecipe):
             (u'Reklama', u'http://www.wirtualnemedia.pl/rss/wm_reklama.xml'),
             (u'PR', u'http://www.wirtualnemedia.pl/rss/wm_relations.xml'),
             (u'Technologie', u'http://www.wirtualnemedia.pl/rss/wm_telekomunikacja.xml'),
-            (u'Telewizja', u'http://www.wirtualnemedia.pl/rss/wm_telewizja_rss.xml')
-            ]
-
+            (u'Telewizja', u'http://www.wirtualnemedia.pl/rss/wm_telewizja_rss.xml')]
     def print_version(self, url):
         return url.replace('artykul', 'print')
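The print_version() above relies on wirtualnemedia.pl serving a print view at the same path with 'artykul' swapped for 'print'. For example (slug invented for illustration):

# http://www.wirtualnemedia.pl/artykul/przykladowy-tytul
#   -> http://www.wirtualnemedia.pl/print/przykladowy-tytul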
@@ -1,5 +1,6 @@
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from calibre.web.feeds.news import BasicNewsRecipe
+
 class ZTS(BasicNewsRecipe):
     title = u'Zaufana Trzecia Strona'
     __author__ = 'fenuks'
@@ -7,6 +8,7 @@ class ZTS(BasicNewsRecipe):
     category = 'IT, security'
     language = 'pl'
     cover_url = 'http://www.zaufanatrzeciastrona.pl/wp-content/uploads/2012/08/z3s_h100.png'
+    extra_css = '.thumbnail {float: left; margin-right:5px;}'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
setup/iso_639/ca.po (2650 changed lines, diff collapsed in this view)
setup/iso_639/de.po (2655 changed lines, diff collapsed in this view)
@@ -9,14 +9,14 @@ msgstr ""
 "Project-Id-Version: calibre\n"
 "Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2013-02-26 12:21+0000\n"
-"Last-Translator: Miguel Angel del Olmo <silinio45@gmail.com>\n"
+"PO-Revision-Date: 2013-03-19 21:03+0000\n"
+"Last-Translator: Jorge Luis Granda <costeelation@hotmail.com>\n"
 "Language-Team: Español; Castellano <>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2013-02-27 04:37+0000\n"
-"X-Generator: Launchpad (build 16506)\n"
+"X-Launchpad-Export-Date: 2013-03-20 04:42+0000\n"
+"X-Generator: Launchpad (build 16532)\n"

 #. name for aaa
 msgid "Ghotuo"
@@ -9808,7 +9808,7 @@ msgstr "Huave; San Mateo Del Mar"

 #. name for huw
 msgid "Hukumina"
-msgstr ""
+msgstr "Hukumina"

 #. name for hux
 msgid "Huitoto; Nüpode"
@@ -9816,15 +9816,15 @@ msgstr "Huitoto; Nipode"

 #. name for huy
 msgid "Hulaulá"
-msgstr ""
+msgstr "Hulaulá"

 #. name for huz
 msgid "Hunzib"
-msgstr ""
+msgstr "Hunzib"

 #. name for hvc
 msgid "Haitian Vodoun Culture Language"
-msgstr ""
+msgstr "Idioma de la cultura haitiana vodoun"

 #. name for hve
 msgid "Huave; San Dionisio Del Mar"
@@ -9832,11 +9832,11 @@ msgstr "Huave; San Dionisio Del Mar"

 #. name for hvk
 msgid "Haveke"
-msgstr ""
+msgstr "Haveke"

 #. name for hvn
 msgid "Sabu"
-msgstr ""
+msgstr "Sabu"

 #. name for hvv
 msgid "Huave; Santa María Del Mar"
@@ -9844,7 +9844,7 @@ msgstr "Huave; Santa María Del Mar"

 #. name for hwa
 msgid "Wané"
-msgstr ""
+msgstr "Wané"

 #. name for hwc
 msgid "Creole English; Hawai'i"
@@ -9856,7 +9856,7 @@ msgstr ""

 #. name for hya
 msgid "Hya"
-msgstr ""
+msgstr "Hya"

 #. name for hye
 msgid "Armenian"
@@ -9864,7 +9864,7 @@ msgstr "Armenio"

 #. name for iai
 msgid "Iaai"
-msgstr ""
+msgstr "Iaai"

 #. name for ian
 msgid "Iatmul"
@@ -30664,31 +30664,31 @@ msgstr ""

 #. name for zpu
 msgid "Zapotec; Yalálag"
-msgstr ""
+msgstr "Zapotec; Yalálag"

 #. name for zpv
 msgid "Zapotec; Chichicapan"
-msgstr ""
+msgstr "Zapotec; Chichicapan"

 #. name for zpw
 msgid "Zapotec; Zaniza"
-msgstr ""
+msgstr "Zapotec; Zaniza"

 #. name for zpx
 msgid "Zapotec; San Baltazar Loxicha"
-msgstr ""
+msgstr "Zapotec; San Baltazar Loxicha"

 #. name for zpy
 msgid "Zapotec; Mazaltepec"
-msgstr ""
+msgstr "Zapotec; Mazaltepec"

 #. name for zpz
 msgid "Zapotec; Texmelucan"
-msgstr ""
+msgstr "Zapotec; Texmelucan"

 #. name for zqe
 msgid "Zhuang; Qiubei"
-msgstr ""
+msgstr "Zhuang; Qiubei"

 #. name for zra
 msgid "Kara (Korea)"
@@ -30732,7 +30732,7 @@ msgstr "Malayo estándar"

 #. name for zsr
 msgid "Zapotec; Southern Rincon"
-msgstr ""
+msgstr "Zapotec; Southern Rincon"

 #. name for zsu
 msgid "Sukurum"
@@ -30760,11 +30760,11 @@ msgstr "Zapoteco de Santa Catarina Albarradas"

 #. name for ztp
 msgid "Zapotec; Loxicha"
-msgstr ""
+msgstr "Zapotec; Loxicha"

 #. name for ztq
 msgid "Zapotec; Quioquitani-Quierí"
-msgstr ""
+msgstr "Zapotec; Quioquitani-Quierí"

 #. name for zts
 msgid "Zapotec; Tilquiapan"
@@ -12,14 +12,14 @@ msgstr ""
 "Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
 "devel@lists.alioth.debian.org>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2013-02-04 07:01+0000\n"
+"PO-Revision-Date: 2013-03-16 14:32+0000\n"
 "Last-Translator: drMerry <Unknown>\n"
 "Language-Team: Dutch <vertaling@vrijschrift.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2013-02-05 04:44+0000\n"
-"X-Generator: Launchpad (build 16468)\n"
+"X-Launchpad-Export-Date: 2013-03-17 04:58+0000\n"
+"X-Generator: Launchpad (build 16532)\n"
 "Language: nl\n"

 #. name for aaa
@@ -340,7 +340,7 @@ msgstr "Adi"

 #. name for adj
 msgid "Adioukrou"
-msgstr ""
+msgstr "Adiokrou"

 #. name for adl
 msgid "Galo"
@@ -352,11 +352,11 @@ msgstr "Adang"

 #. name for ado
 msgid "Abu"
-msgstr ""
+msgstr "Abu"

 #. name for adp
 msgid "Adap"
-msgstr ""
+msgstr "Adap"

 #. name for adq
 msgid "Adangbe"
@@ -372,7 +372,7 @@ msgstr "Adamorobe gebarentaal"

 #. name for adt
 msgid "Adnyamathanha"
-msgstr ""
+msgstr "Adnyamathanha"

 #. name for adu
 msgid "Aduge"
@@ -392,7 +392,7 @@ msgstr "Adyghe"

 #. name for adz
 msgid "Adzera"
-msgstr ""
+msgstr "Adzera"

 #. name for aea
 msgid "Areba"
@@ -416,11 +416,11 @@ msgstr "Pashai; noordoost"

 #. name for aek
 msgid "Haeke"
-msgstr ""
+msgstr "Haeke"

 #. name for ael
 msgid "Ambele"
-msgstr ""
+msgstr "Ambele"

 #. name for aem
 msgid "Arem"
@@ -432,7 +432,7 @@ msgstr "Armeense gebarentaal"

 #. name for aeq
 msgid "Aer"
-msgstr ""
+msgstr "Aer"

 #. name for aer
 msgid "Arrernte; Eastern"
@@ -440,7 +440,7 @@ msgstr "Arrernte; oostelijk"

 #. name for aes
 msgid "Alsea"
-msgstr ""
+msgstr "Alsea"

 #. name for aeu
 msgid "Akeu"
@@ -468,7 +468,7 @@ msgstr "Andai"

 #. name for afe
 msgid "Putukwam"
-msgstr ""
+msgstr "Putukwam"

 #. name for afg
 msgid "Afghan Sign Language"
@@ -13,14 +13,14 @@ msgstr ""
 "Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
 "devel@lists.alioth.debian.org>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2013-02-21 23:51+0000\n"
+"PO-Revision-Date: 2013-03-23 10:17+0000\n"
 "Last-Translator: Глория Хрусталёва <gloriya@hushmail.com>\n"
 "Language-Team: Russian <debian-l10n-russian@lists.debian.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2013-02-23 05:19+0000\n"
-"X-Generator: Launchpad (build 16506)\n"
+"X-Launchpad-Export-Date: 2013-03-24 04:45+0000\n"
+"X-Generator: Launchpad (build 16540)\n"
 "Language: ru\n"

 #. name for aaa
@@ -5381,7 +5381,7 @@ msgstr ""

 #. name for cof
 msgid "Colorado"
-msgstr ""
+msgstr "Колорадо"

 #. name for cog
 msgid "Chong"
@@ -5505,7 +5505,7 @@ msgstr ""

 #. name for cqu
 msgid "Quechua; Chilean"
-msgstr ""
+msgstr "Кечуа; Чилийский"

 #. name for cra
 msgid "Chara"