diff --git a/Changelog.yaml b/Changelog.yaml index 99bcfcfeb0..fcaea34696 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,149 @@ # new recipes: # - title: +- version: 0.8.63 + date: 2012-08-02 + + new features: + - title: "E-book viewer: Allow quick saving and loading of viewer settings as 'themes'." + tickets: [1024611] + + - title: "Ebook-viewer: Add a restore defaults button to the viewer preferences dialog" + + - title: "E-book viewer: Add simple settings for text and background colors" + + - title: "Add an entry to save to disk when right clicking a format in the book details panel" + + - title: "ODT metadata: Read first image as the metadata cover from ODT files. Also allow ODT authors to set custom properties for extended metadata." + + - title: "E-book viewer and PDF Output: Resize images that are longer than the page to fit onto a single page" + + bug fixes: + - title: "KF8 Output: Fix bug where some calibre generated KF8 files would cause the Amazon KF8 viewer on the Touch to go to into an infinite loop when using the next page function" + tickets: [1026421] + + - title: "News download: Add support for tags that link to SVG images." + tickets: [1031553] + + - title: "Update podofo to 0.9.1 in all binary builds, to fix corruption of some PDFs when updating metadata." + tickets: [1031086] + + - title: "Catalog generation: Handle authors whose last name is a number." + + - title: "KF8 Input: Handle html entities in the NCX toc entries correctly" + + - title: "Fix a calibre crash that affected some windows installs" + tickets: [1030234] + + - title: "MOBI Output: Normalize unicode strings before writing to file, to workaround lack of support for non-normal unicode in Amazon's MOBI renderer." 
+ tickets: [1029825] + + - title: "EPUB Input: Handle files that have duplicate entries in the spine" + + - title: "Fix regression in Kobo driver that caused the on device column to not be updated after deleting books" + + new recipes: + - title: Dziennik Polski + author: Gregorz Maj + + - title: High Country Blogs + author: Armin Geller + + - title: Philosophy Now + author: Rick Shang + +- version: 0.8.62 + date: 2012-07-27 + + new features: + - title: "Book details panel: Allow right clicking on a format to delete it." + + - title: "When errors occur in lots of background jobs, add an option to the error message to temporarily suppress subsequent error messages." + tickets: [886904] + + - title: "E-book viewer full screen mode: Allow clicking in the left and right page margins to turn pages." + tickets: [1024819] + + - title: "Drivers for various Android devices" + tickets: [1028690,1027431] + + - title: "Advanced search dialog: When starting on the title/author/etc. tab, restore the previously used search kind as well." + tickets: [1029745] + + - title: "When presenting the calibre must be restarted warning after installing a new plugin, add a restart now button so that the user can conveniently restart calibre. Currently only works when going vie Preferences->Plugins->Get new plugins" + + bug fixes: + - title: "Fix main window layout state being saved incorrectly if calibre is killed without a proper shutdown" + + - title: "Fix boolean and date searching in non english calibre installs." 
+ + - title: "Conversion: Ignore invalid chapter detection and level n ToC expressions instead of erroring out" + + improved recipes: + - Psychology Today + - The Smithsonian + - The New Republic + - Various updated Polish news sources + - The Sun + - San Francisco Bay Guardian + - AnandTech + - Smashing Magazine + + new recipes: + - title: Linux Journal and Conowego.pl + author: fenuks + + - title: A list apart and .net magazine + author: Marc Busque + +- version: 0.8.61 + date: 2012-07-20 + + new features: + - title: "E-book viewer: Add a paged mode that splits up the text into pages, like in a paper book instead of presenting it as a single column. To activate click the button with the yellow scroll icon in the top right corner." + type: major + description: "In paged mode, the ebook viewer no longer cuts off the last line of text at the bottom of the screen, and it respects CSS page-break directives. You can also set page margins and control the number of pages displayed on screen by clicking the Preferences button in the viewer and going to 'Text layout in paged mode'." + + - title: "Digitally sign the calibre OS X and windows builds" + + - title: "Get Books: Add Mills and Boon UK" + + - title: "Various minor improvements to the Bulk metadata edit dialog" + tickets: [1025825, 1025838, 1025628] + + - title: "Fix various regression in the auto-complete functionality for authors/series/tags etc introduced in 0.8.60" + + - title: "Drivers for various new Android devices" + tickets: [1024934] + + - title: "MOBI: Add support for the new language EXTH header field in MOBI files generated by kindlegen 2.5" + + bug fixes: + - title: "KF8 Output: Fix calibre produced KF8 files not showing the 'Use publisher font' option on the Kindle Touch when they have embedded fonts" + + - title: "Txt/fb2/rtf/pml/rb output: Fix non-visibile element's tail text (which should be visible) is being ignored when it shouldn't." 
+ tickets: [1026541] + + - title: "Book details panel: When displaying a link to amazon, use a country specific name like amazon.fr instead of using amazon.com for all countries" + + - title: "Conversion: When splitting on page breaks, ignore page-breaks with values of auto and inherit. " + tickets: [1018875] + + - title: "Metadata jacket: Specify foreground in addition to the background color for the title banner so that it remain readable if the user tries to monkey with the CSS in the viewer." + + - title: "PDF Output: Fix rendering of cover as first age of PDF (ignore margins so that the image covers the entire page)" + + - title: "Linux binaries: Bundle libglib to avoid incompatibilities with glib on various distros." + tickets: [1022019] + + - title: "Fix find_identical_books() choking on books with too many authors" + + + improved recipes: + - Toronto Star + - American Prospect + - faz.net + - version: 0.8.60 date: 2012-07-13 diff --git a/manual/conversion.rst b/manual/conversion.rst index 5eaca5a469..feae2a4273 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -710,3 +710,35 @@ EPUB from the ZIP file are:: Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer. + +Convert ODT documents +~~~~~~~~~~~~~~~~~~~~~ + +|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting. +When inserting images into your document you need to anchor them to the paragraph, images anchored to a page will all end up in the front of the conversion. + +To enable automatic detection of chapters, you need to mark them with the build-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag
<h1>, 'Heading 2' to <h2>
etc). When you convert in |app| you can enter which style you used into the 'Detect chapters at' box. Example: + + * If you mark Chapters with style 'Heading 2', you have to set the 'Detect chapters at' box to ``//h:h2`` + * For a nested TOC with Sections marked with 'Heading 2' and the Chapters marked with 'Heading 3' you need to enter ``//h:h2|//h:h3``. On the Convert - TOC page set the 'Level 1 TOC' box to ``//h:h2`` and the 'Level 2 TOC' box to ``//h:h3``. + +Well-known document properties (Title, Keywords, Description, Creator) are recognized and |app| will use the first image (not to small, and with good aspect-ratio) as the cover image. + +There is also an advanced property conversion mode, which is activated by setting the custom property ``opf.metadata`` ('Yes or No' type) to Yes in your ODT document (File->Properties->Custom Properties). +If this property is detected by |app|, the following custom properties are recognized (``opf.authors`` overrides document creator):: + + opf.titlesort + opf.authors + opf.authorsort + opf.publisher + opf.pubdate + opf.isbn + opf.language + opf.series + opf.seriesindex + +In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used. +As the cover detection might result in double covers in certain output formats, the process will remove the paragraph (only if the only content is the cover!) from the document. But this works only with the named picture! + +To disable cover detection you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes in advanced mode. 
+ diff --git a/manual/develop.rst b/manual/develop.rst old mode 100755 new mode 100644 index 12bbcefe57..d59c315951 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -6,9 +6,9 @@ Setting up a |app| development environment =========================================== |app| is completely open source, licensed under the `GNU GPL v3 `_. -This means that you are free to download and modify the program to your heart's content. In this section, -you will learn how to get a |app| development environment set up on the operating system of your choice. -|app| is written primarily in `Python `_ with some C/C++ code for speed and system interfacing. +This means that you are free to download and modify the program to your heart's content. In this section, +you will learn how to get a |app| development environment set up on the operating system of your choice. +|app| is written primarily in `Python `_ with some C/C++ code for speed and system interfacing. Note that |app| is not compatible with Python 3 and requires at least Python 2.7. .. contents:: Contents @@ -20,14 +20,14 @@ Design philosophy |app| has its roots in the Unix world, which means that its design is highly modular. The modules interact with each other via well defined interfaces. This makes adding new features and fixing -bugs in |app| very easy, resulting in a frenetic pace of development. Because of its roots, |app| has a +bugs in |app| very easy, resulting in a frenetic pace of development. Because of its roots, |app| has a comprehensive command line interface for all its functions, documented in :ref:`cli`. The modular design of |app| is expressed via ``Plugins``. There is a :ref:`tutorial ` on writing |app| plugins. For example, adding support for a new device to |app| typically involves writing less than a 100 lines of code in the form of -a device driver plugin. You can browse the -`built-in drivers `_. Similarly, adding support -for new conversion formats involves writing input/output format plugins. 
Another example of the modular design is the :ref:`recipe system ` for +a device driver plugin. You can browse the +`built-in drivers `_. Similarly, adding support +for new conversion formats involves writing input/output format plugins. Another example of the modular design is the :ref:`recipe system ` for fetching news. For more examples of plugins designed to add features to |app|, see the `plugin index `_. Code layout @@ -91,15 +91,15 @@ this, make your changes, then run:: This will create a :file:`my-changes` file in the current directory, simply attach that to a ticket on the |app| `bug tracker `_. -If you plan to do a lot of development on |app|, then the best method is to create a +If you plan to do a lot of development on |app|, then the best method is to create a `Launchpad `_ account. Once you have an account, you can use it to register your bzr branch created by the `bzr branch` command above. First run the following command to tell bzr about your launchpad account:: bzr launchpad-login your_launchpad_username -Now, you have to setup SSH access to Launchpad. First create an SSH public/private keypair. Then upload -the public key to Launchpad by going to your Launchpad account page. Instructions for setting up the +Now, you have to setup SSH access to Launchpad. First create an SSH public/private keypair. Then upload +the public key to Launchpad by going to your Launchpad account page. Instructions for setting up the private key in bzr are at http://bazaar-vcs.org/Bzr_and_SSH. Now you can upload your branch to the |app| project in Launchpad by following the instructions at https://help.launchpad.net/Code/UploadingABranch. Whenever you commit changes to your branch with the command:: @@ -108,7 +108,7 @@ Whenever you commit changes to your branch with the command:: Kovid can merge it directly from your branch into the main |app| source tree. You should also keep an eye on the |app| `development forum `. 
Before making major changes, you should -discuss them in the forum or contact Kovid directly (his email address is all over the source code). +discuss them in the forum or contact Kovid directly (his email address is all over the source code). Windows development environment --------------------------------- @@ -118,12 +118,12 @@ the previously checked out |app| code directory. For example:: cd C:\Users\kovid\work\calibre -calibre is the directory that contains the src and resources sub-directories. +calibre is the directory that contains the src and resources sub-directories. The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. So, following the example above, it would be ``C:\Users\kovid\work\calibre\src``. `Here is a short guide `_ to setting environment -variables on Windows. +variables on Windows. Once you have set the environment variable, open a new command prompt and check that it was correctly set by using the command:: @@ -134,7 +134,7 @@ Setting this environment variable means that |app| will now load all its Python That's it! You are now ready to start hacking on the |app| code. For example, open the file :file:`src\\calibre\\__init__.py` in your favorite editor and add the line:: - + print ("Hello, world!") near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``. @@ -149,24 +149,25 @@ the previously checked out |app| code directory, for example:: calibre is the directory that contains the src and resources sub-directories. Ensure you have installed the |app| commandline tools via :guilabel:`Preferences->Advanced->Miscellaneous` in the |app| GUI. -The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. -So, following the example above, it would be ``/Users/kovid/work/calibre/src``. Apple -`documentation `_ -on how to set environment variables. 
+The next step is to create a bash script that will set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory when running calibre in debug mode. -Once you have set the environment variable, open a new Terminal and check that it was correctly set by using -the command:: +Create a plain text file:: - echo $CALIBRE_DEVELOP_FROM + #!/bin/sh + export CALIBRE_DEVELOP_FROM="/Users/kovid/work/calibre/src" + calibre-debug -g -Setting this environment variable means that |app| will now load all its Python code from the specified location. +Save this file as ``/usr/bin/calibre-develop``, then set its permissions so that it can be executed:: -That's it! You are now ready to start hacking on the |app| code. For example, open the file :file:`src/calibre/__init__.py` -in your favorite editor and add the line:: - - print ("Hello, world!") + chmod +x /usr/bin/calibre-develop -near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``. +Once you have done this, run:: + + calibre-develop + +You should see some diagnostic information in the Terminal window as calibre +starts up, and you should see an asterisk after the version number in the GUI +window, indicating that you are running from source. Linux development environment ------------------------------ @@ -181,11 +182,11 @@ Install the |app| using the binary installer. Then open a terminal and change to cd /home/kovid/work/calibre -calibre is the directory that contains the src and resources sub-directories. +calibre is the directory that contains the src and resources sub-directories. The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. So, following the example above, it would be ``/home/kovid/work/calibre/src``. How to set environment variables depends on -your Linux distribution and what shell you are using. 
+your Linux distribution and what shell you are using. Once you have set the environment variable, open a new terminal and check that it was correctly set by using the command:: @@ -196,7 +197,7 @@ Setting this environment variable means that |app| will now load all its Python That's it! You are now ready to start hacking on the |app| code. For example, open the file :file:`src/calibre/__init__.py` in your favorite editor and add the line:: - + print ("Hello, world!") near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``. diff --git a/manual/news.rst b/manual/news.rst index 873025d467..9783a262aa 100755 --- a/manual/news.rst +++ b/manual/news.rst @@ -30,7 +30,7 @@ Lets pick a couple of feeds that look interesting: #. Business Travel: http://feeds.portfolio.com/portfolio/businesstravel #. Tech Observer: http://feeds.portfolio.com/portfolio/thetechobserver -I got the URLs by clicking the little orange RSS icon next to each feed name. To make |app| download the feeds and convert them into an ebook, you should click the :guilabel:`Fetch news` button and then the :guilabel:`Add a custom news source` menu item. A dialog similar to that shown below should open up. +I got the URLs by clicking the little orange RSS icon next to each feed name. To make |app| download the feeds and convert them into an ebook, you should right click the :guilabel:`Fetch news` button and then the :guilabel:`Add a custom news source` menu item. A dialog similar to that shown below should open up. .. 
image:: images/custom_news.png :align: center diff --git a/recipes/anandtech.recipe b/recipes/anandtech.recipe index aa10084070..ff08c828ac 100644 --- a/recipes/anandtech.recipe +++ b/recipes/anandtech.recipe @@ -21,8 +21,12 @@ class anan(BasicNewsRecipe): remove_javascript = True encoding = 'utf-8' - remove_tags=[dict(name='a', attrs={'style':'width:110px; margin-top:0px;text-align:center;'}), - dict(name='a', attrs={'style':'width:110px; margin-top:0px; margin-right:20px;text-align:center;'})] + remove_tags=[ + dict(name='a', attrs={'style':'width:110px; margin-top:0px;text-align:center;'}), + dict(name='a', attrs={'style':'width:110px; margin-top:0px; margin-right:20px;text-align:center;'}), + {'attrs':{'class':['article_links', 'header', 'body_right']}}, + {'id':['crumbs']}, + ] feeds = [ ('Anandtech', 'http://www.anandtech.com/rss/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 00eea1be68..9544abdfcf 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe import re -class Benchmark_pl(BasicNewsRecipe): +class BenchmarkPl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' @@ -14,7 +14,7 @@ class Benchmark_pl(BasicNewsRecipe): preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe new file mode 100755 index 0000000000..8b4288ddcd --- /dev/null +++ b/recipes/conowego_pl.recipe @@ -0,0 +1,38 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class CoNowegoPl(BasicNewsRecipe): + title = u'conowego.pl' + __author__ = 'fenuks' + description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !' 
+ cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png' + category = 'IT, news' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(name='div', attrs={'class':'news_list single_view'})] + remove_tags = [dict(name='div', attrs={'class':['ni_bottom', 'ni_rank', 'ni_date']})] + feeds = [(u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'), (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'), (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'), (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100')] + + def preprocess_html(self, soup): + for i in soup.findAll('img'): + i.parent.insert(0, BeautifulSoup('
')) + i.insert(len(i), BeautifulSoup('
')) + self.append_page(soup, soup.body) + return soup + + + def append_page(self, soup, appendtag): + tag = appendtag.find('div', attrs={'class':'pages'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[:-1]: + soup2 = self.index_to_soup('http://www.conowego.pl/' + nexturl['href']) + pagetext = soup2.find(attrs={'class':'ni_content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}): + r.extract() diff --git a/recipes/dot_net.recipe b/recipes/dot_net.recipe new file mode 100644 index 0000000000..50db71e9be --- /dev/null +++ b/recipes/dot_net.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class NetMagazineRecipe (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'.net magazine' + description = u'net is the world’s best-selling magazine for web designers and developers, featuring tutorials from leading agencies, interviews with the web’s biggest names, and agenda-setting features on the hottest issues affecting the internet today.' 
+ language = 'en' + tags = 'web development, software' + oldest_article = 7 + remove_empty_feeds = True + no_stylesheets = True + cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png' + keep_only_tags = [ + dict(name='article', attrs={'class': re.compile('^node.*$', re.IGNORECASE)}) + ] + remove_tags = [ + dict(name='span', attrs={'class': 'comment-count'}), + dict(name='div', attrs={'class': 'item-list share-links'}), + dict(name='footer'), + ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style'] + extra_css = 'img {max-width: 100%; display: block; margin: auto;} .captioned-image div {text-align: center; font-style: italic;}' + + feeds = [ + (u'.net', u'http://feeds.feedburner.com/net/topstories'), + ] diff --git a/recipes/dziennik_polski.recipe b/recipes/dziennik_polski.recipe new file mode 100644 index 0000000000..83b9d06ecd --- /dev/null +++ b/recipes/dziennik_polski.recipe @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +__license__='GPL v3' +__author__='grzegorz.maj@dziennik.krakow.pl>' + +''' +http://dziennikpolski24.pl +Author: grzegorz.maj@dziennik.krakow.pl +''' +from calibre.web.feeds.news import BasicNewsRecipe + +class DziennikPolski24(BasicNewsRecipe): + + title=u'Dziennik Polski' + publisher=u'Grupa Polskapresse' + + __author__='grzegorz.maj' + description=u'Wiadomości z wydania Dziennika Polskiego' + oldest_article=1 + max_articles_per_feed=50 + needs_subscription=True + + remove_javascript=True + no_stylesheets=True + use_embedded_content=False + remove_empty_feeds=True + extra_css='.date{margin-top: 4em;} .logo_author{margin-left:0.5em;}' + + publication_type='newspaper' + cover_url='http://www.dziennikpolski24.pl/_p/images/logoDP24-b.gif' + INDEX='http://dziennikpolski24.pl/' + + encoding='utf-8' + language='pl' + + keep_only_tags=[ + + dict(name = 'div', attrs = {'class':['toolbar']}) + , dict(name = 'h1') + , dict(name = 
'h2', attrs = {'class':['teaser']}) + , dict(name = 'div', attrs = {'class':['picture']}) + , dict(name = 'div', attrs = {'id':['showContent']}) + , dict(name = 'div', attrs = {'class':['paging']}) + , dict(name = 'div', attrs = {'class':['wykupTresc']}) + ] + + remove_tags=[ + + ] + + feeds=[ + (u'Kraj', u'http://www.dziennikpolski24.pl/rss/feed/1151') + , (u'Świat', u'http://www.dziennikpolski24.pl/rss/feed/1153') + , (u'Gospodarka', u'http://www.dziennikpolski24.pl/rss/feed/1154') + , (u'Małopolska', u'http://www.dziennikpolski24.pl/rss/feed/1155') + , (u'Kultura', u'http://www.dziennikpolski24.pl/rss/feed/1156') + , (u'Opinie', u'http://www.dziennikpolski24.pl/rss/feed/1158') + , (u'Kronika Nowohucka', u'http://www.dziennikpolski24.pl/rss/feed/1656') + , (u'Na bieżąco', u'http://www.dziennikpolski24.pl/rss/feed/1543') + , (u'Londyn 2012', u'http://www.dziennikpolski24.pl/rss/feed/2545') + , (u'Piłka nożna', u'http://www.dziennikpolski24.pl/rss/feed/2196') + , (u'Siatkówka', u'http://www.dziennikpolski24.pl/rss/feed/2197') + , (u'Koszykówka', u'http://www.dziennikpolski24.pl/rss/feed/2198') + , (u'Tenis', u'http://www.dziennikpolski24.pl/rss/feed/2199') + , (u'Formuła 1', u'http://www.dziennikpolski24.pl/rss/feed/2203') + , (u'Lekkoatletyka', u'http://www.dziennikpolski24.pl/rss/feed/2204') + , (u'Żużel', u'http://www.dziennikpolski24.pl/rss/feed/2200') + , (u'Sporty motorowe', u'http://www.dziennikpolski24.pl/rss/feed/2206') + , (u'Publicystyka sportowa', u'http://www.dziennikpolski24.pl/rss/feed/2201') + , (u'Kolarstwo', u'http://www.dziennikpolski24.pl/rss/feed/2205') + , (u'Inne', u'http://www.dziennikpolski24.pl/rss/feed/2202') + , (u'Miasto Kraków', u'http://www.dziennikpolski24.pl/rss/feed/1784') + , (u'Region nowosądecki', u'http://www.dziennikpolski24.pl/rss/feed/1795') + , (u'Region Małopolski Zachodniej', u'http://www.dziennikpolski24.pl/rss/feed/1793') + , (u'Region tarnowski', u'http://www.dziennikpolski24.pl/rss/feed/1797') + , (u'Region 
podhalański', u'http://www.dziennikpolski24.pl/rss/feed/1789') + , (u'Region olkuski', u'http://www.dziennikpolski24.pl/rss/feed/1670') + , (u'Region miechowski', u'http://www.dziennikpolski24.pl/rss/feed/1806') + , (u'Region podkrakowski', u'http://www.dziennikpolski24.pl/rss/feed/1787') + , (u'Region proszowicki', u'http://www.dziennikpolski24.pl/rss/feed/1804') + , (u'Region wielicki', u'http://www.dziennikpolski24.pl/rss/feed/1802') + , (u'Region podbeskidzki', u'http://www.dziennikpolski24.pl/rss/feed/1791') + , (u'Region myślenicki', u'http://www.dziennikpolski24.pl/rss/feed/1800') + , (u'Autosalon', u'http://www.dziennikpolski24.pl/rss/feed/1294') + , (u'Kariera', u'http://www.dziennikpolski24.pl/rss/feed/1289') + , (u'Przegląd nieruchomości', u'http://www.dziennikpolski24.pl/rss/feed/1281') + , (u'Magnes', u'http://www.dziennikpolski24.pl/rss/feed/1283') + , (u'Magazyn Piątek', u'http://www.dziennikpolski24.pl/rss/feed/1293') + , (u'Pejzaż rodzinny', u'http://www.dziennikpolski24.pl/rss/feed/1274') + , (u'Podróże', u'http://www.dziennikpolski24.pl/rss/feed/1275') + , (u'Konsument', u'http://www.dziennikpolski24.pl/rss/feed/1288') + ] + + def append_page(self, soup, appendtag): + loop=False + tag=soup.find('div', attrs = {'class':'paging'}) + if tag: + loop=True + li_nks=tag.findAll('li') + appendtag.find('div', attrs = {'class':'paging'}).extract() + if appendtag.find('ul', attrs = {'class':'menuf'}): + appendtag.find('ul', attrs = {'class':'menuf'}).extract() + while loop: + loop=False + for li_nk in li_nks: + link_tag=li_nk.contents[0].contents[0].string + if u'następna' in link_tag: + soup2=self.index_to_soup(self.INDEX+li_nk.contents[0]['href']) + if soup2.find('div', attrs = {'id':'showContent'}): + pagetext=soup2.find('div', attrs = {'id':'showContent'}) + pos=len(appendtag.contents) + appendtag.insert(pos, pagetext) + if soup2.find('div', attrs = {'class':'rightbar'}): + pagecont=soup2.find('div', attrs = {'class':'rightbar'}) + 
tag=pagecont.find('div', attrs = {'class':'paging'}) + li_nks=tag.findAll('li') + loop=True + + def get_browser(self): + br=BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://www.dziennikpolski24.pl/pl/moje-konto/950606-loguj.html') + br.select_form(nr = 1) + br["user_login[login]"]=self.username + br['user_login[pass]']=self.password + br.submit() + return br + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + diff --git a/recipes/ekundelek_pl.recipe b/recipes/ekundelek_pl.recipe new file mode 100644 index 0000000000..ebc5d39bbd --- /dev/null +++ b/recipes/ekundelek_pl.recipe @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = u'2012, Artur Stachecki ' + +from calibre.web.feeds.news import BasicNewsRecipe + +class swiatczytnikow(BasicNewsRecipe): + title = u'eKundelek' + description = u'Najsympatyczniejszy blog o e-czytnikach Kindle' + language = 'pl' + __author__ = u'Artur Stachecki' + oldest_article = 7 + max_articles_per_feed = 100 + + remove_tags = [dict(name = 'div', attrs = {'class' : 'feedflare'})] + + feeds = [(u'Wpisy', u'http://feeds.feedburner.com/Ekundelekpl?format=xml')] diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe index 6ee5ae3fb6..84455ddd3c 100644 --- a/recipes/fhm_uk.recipe +++ b/recipes/fhm_uk.recipe @@ -18,15 +18,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): keep_only_tags = [ dict(name='h1'), dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}), - dict(name='div',attrs={'id' : ['articleLeft']}), - dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}), + dict(name='div',attrs={'id' : ['profileLeft','articleLeft','profileRight','profileBody']}), + dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody',]}), ] - #remove_tags = [ - #dict(attrs={'class' : ['player']}), + remove_tags = [ + dict(attrs={'id' : 
['ctl00_Body_divSlideShow' ]}), - #] + ] feeds = [ (u'Homepage 1',u'http://feed43.com/6655867614547036.xml'), (u'Homepage 2',u'http://feed43.com/4167731873103110.xml'), @@ -34,7 +34,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): (u'Homepage 4',u'http://feed43.com/6550421522527341.xml'), (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'), (u'Gaming',u'http://feed43.com/6537162612465672.xml'), - (u'Girls',u'http://feed43.com/3674777224513254.xml'), + (u'Girls',u'http://feed43.com/4574262733341068.xml'),# edit link http://feed43.com/feed.html?name=4574262733341068 ] extra_css = ''' diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 2a6e00d501..ba34c9ff63 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -1,6 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe - -class Filmweb_pl(BasicNewsRecipe): +import re +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class FilmWebPl(BasicNewsRecipe): title = u'FilmWeb' __author__ = 'fenuks' description = 'FilmWeb - biggest polish movie site' @@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets= True remove_empty_feeds=True + preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' - remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] + remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), (u'News / Filmy 
w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), @@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe): (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'), (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), - (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')] + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest') + ] - def skip_ad_pages(self, soup): + def skip_ad_pages(self, soup): skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'}) if skip_tag is not None: - self.log.warn('skip_tag') - self.log.warn(skip_tag) return self.index_to_soup(skip_tag['href'], raw=True) - + def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + for i in soup.findAll('a', attrs={'class':'fn'}): + i.insert(len(i), BeautifulSoup('
')) + for i in soup.findAll('sup'): + if not i.string or i.string.startswith('(kliknij'): + i.extract() + return soup diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index e188e4988c..fce9674081 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.recipes import BasicNewsRecipe -class Gry_online_pl(BasicNewsRecipe): +class GryOnlinePl(BasicNewsRecipe): title = u'Gry-Online.pl' __author__ = 'fenuks' description = 'Gry-Online.pl - computer games' @@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe): tag = appendtag.find('div', attrs={'class':'n5p'}) if tag: nexturls=tag.findAll('a') - for nexturl in nexturls[1:]: - try: - soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) - except: - soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + url_part = soup.find('link', attrs={'rel':'canonical'})['href'] + url_part = url_part[25:].rpartition('?')[0] + for nexturl in nexturls[1:-1]: + soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href']) pagetext = soup2.find(attrs={'class':'gc660'}) for r in pagetext.findAll(name='header'): r.extract() + for r in pagetext.findAll(attrs={'itemprop':'description'}): + r.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}): r.extract() diff --git a/recipes/high_country_blogs.recipe b/recipes/high_country_blogs.recipe new file mode 100644 index 0000000000..5173c30596 --- /dev/null +++ b/recipes/high_country_blogs.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal , Armin Geller' + +''' +Fetch High Country News - Blogs +''' +from calibre.web.feeds.news import BasicNewsRecipe +class 
HighCountryNewsBlogs(BasicNewsRecipe): + + title = u'High Country News - Blogs' + description = u'High Country News - Blogs (RSS Version)' + __author__ = 'Armin Geller' # 2012-08-01 + publisher = 'High Country News' + category = 'news, politics, Germany' + timefmt = ' [%a, %d %b %Y]' + language = 'en' + encoding = 'UTF-8' + publication_type = 'newspaper' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + auto_cleanup = True + remove_javascript = True + use_embedded_content = False + masthead_url = 'http://www.hcn.org/logo.jpg' + cover_source = 'http://www.hcn.org' + + def get_cover_url(self): + cover_source_soup = self.index_to_soup(self.cover_source) + preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'}) + return preview_image_div.div.img['src'] + + feeds = [ + (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'), + + (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'), + (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'), + (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'), + ] + + def print_version(self, url): + return url + diff --git a/recipes/icons/conowego_pl.png b/recipes/icons/conowego_pl.png new file mode 100644 index 0000000000..3bc8f2c672 Binary files /dev/null and b/recipes/icons/conowego_pl.png differ diff --git a/recipes/icons/dziennik_polski.png b/recipes/icons/dziennik_polski.png new file mode 100644 index 0000000000..d06507eca7 Binary files /dev/null and b/recipes/icons/dziennik_polski.png differ diff --git a/recipes/icons/linux_journal.png b/recipes/icons/linux_journal.png new file mode 100644 index 0000000000..ed0092bd1d Binary files /dev/null and b/recipes/icons/linux_journal.png differ diff --git a/recipes/linux_journal.recipe b/recipes/linux_journal.recipe new file mode 100755 index 0000000000..99b1a570dc --- /dev/null +++ b/recipes/linux_journal.recipe @@ 
-0,0 +1,36 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class LinuxJournal(BasicNewsRecipe): + title = u'Linux Journal' + __author__ = 'fenuks' + description = u'The monthly magazine of the Linux community, promoting the use of Linux worldwide.' + cover_url = 'http://www.linuxjournal.com/files/linuxjournal.com/ufiles/logo-lj.jpg' + category = 'IT, Linux' + language = 'en' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + keep_only_tags=[dict(id='content-inner')] + remove_tags_after= dict(attrs={'class':'user-signature clear-block'}) + remove_tags=[dict(attrs={'class':['user-signature clear-block', 'breadcrumb', 'terms terms-inline']})] + feeds = [(u'Front Page', u'http://feeds.feedburner.com/linuxjournalcom'), (u'News', u'http://feeds.feedburner.com/LinuxJournal-BreakingNews'), (u'Blogs', u'http://www.linuxjournal.com/blog/feed'), (u'Audio/Video', u'http://www.linuxjournal.com/taxonomy/term/28/0/feed'), (u'Community', u'http://www.linuxjournal.com/taxonomy/term/18/0/feed'), (u'Education', u'http://www.linuxjournal.com/taxonomy/term/25/0/feed'), (u'Embedded', u'http://www.linuxjournal.com/taxonomy/term/27/0/feed'), (u'Hardware', u'http://www.linuxjournal.com/taxonomy/term/23/0/feed'), (u'HOWTOs', u'http://www.linuxjournal.com/taxonomy/term/19/0/feed'), (u'International', u'http://www.linuxjournal.com/taxonomy/term/30/0/feed'), (u'Security', u'http://www.linuxjournal.com/taxonomy/term/31/0/feed'), (u'Software', u'http://www.linuxjournal.com/taxonomy/term/17/0/feed'), (u'Sysadmin', u'http://www.linuxjournal.com/taxonomy/term/21/0/feed'), (u'Webmaster', u'http://www.linuxjournal.com/taxonomy/term/24/0/feed')] + + def append_page(self, soup, appendtag): + next = appendtag.find('li', attrs={'class':'pager-next'}) + while next: + nexturl = next.a['href'] + appendtag.find('div', attrs={'class':'links'}).extract() + soup2 = self.index_to_soup('http://www.linuxjournal.com'+ 
nexturl) + pagetext = soup2.find(attrs={'class':'node-inner'}).find(attrs={'class':'content'}) + next = appendtag.find('li', attrs={'class':'pager-next'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = appendtag.find('div', attrs={'class':'links'}) + if tag: + tag.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/list_apart.recipe b/recipes/list_apart.recipe new file mode 100644 index 0000000000..35cbaad958 --- /dev/null +++ b/recipes/list_apart.recipe @@ -0,0 +1,33 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe + +class AListApart (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'A List Apart' + description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.' 
+ language = 'en' + tags = 'web development, software' + oldest_article = 120 + remove_empty_feeds = True + no_stylesheets = True + encoding = 'utf8' + cover_url = u'http://alistapart.com/pix/alalogo.gif' + keep_only_tags = [ + dict(name='div', attrs={'id': 'content'}) + ] + remove_tags = [ + dict(name='ul', attrs={'id': 'metastuff'}), + dict(name='div', attrs={'class': 'discuss'}), + dict(name='div', attrs={'class': 'discuss'}), + dict(name='div', attrs={'id': 'learnmore'}), + ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] + extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}' + + feeds = [ + (u'A List Apart', u'http://www.alistapart.com/site/rss'), + ] diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe index fa5d5c19c8..5b7b3a64ed 100644 --- a/recipes/metro_uk.recipe +++ b/recipes/metro_uk.recipe @@ -1,31 +1,42 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1306097511(BasicNewsRecipe): title = u'Metro UK' - description = 'News as provide by The Metro -UK' + description = 'Author Dave Asbury : News as provide by The Metro -UK' #timefmt = '' __author__ = 'Dave Asbury' - #last update 9/6/12 + #last update 4/8/12 cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg' - #no_stylesheets = True + no_stylesheets = True oldest_article = 1 - max_articles_per_feed = 10 + max_articles_per_feed = 12 remove_empty_feeds = True remove_javascript = True - auto_cleanup = True + #auto_cleanup = True encoding = 'UTF-8' - + cover_url ='http://profile.ak.fbcdn.net/hprofile-ak-snc4/157897_117118184990145_840702264_n.jpg' language = 'en_GB' masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif' + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:1.6em;} + h2{font-family:Arial,Helvetica,sans-serif; 
font-weight:normal;font-size:1.2em;} + p{font-family:Arial,Helvetica,sans-serif;font-size:1.0em;} + body{font-family:Helvetica,Arial,sans-serif;font-size:1.0em;} + ''' keep_only_tags = [ - - ] + #dict(name='h1'), + #dict(name='h2'), + #dict(name='div', attrs={'class' : ['row','article','img-cnt figure','clrd']}) + #dict(name='h3'), + #dict(attrs={'class' : 'BText'}), + ] remove_tags = [ - + dict(name='span',attrs={'class' : 'share'}), + dict(name='li'), + dict(attrs={'class' : ['twitter-share-button','header-forms','hdr-lnks','close','art-rgt','fd-gr1-b clrd google-article','news m12 clrd clr-b p5t shareBtm','item-ds csl-3-img news','c-1of3 c-last','c-1of1','pd','item-ds csl-3-img sport']}), + dict(attrs={'id' : ['','sky-left','sky-right','ftr-nav','and-ftr','notificationList','logo','miniLogo','comments-news','metro_extras']}) ] - + remove_tags_before = dict(name='h1') + #remove_tags_after = dict(attrs={'id':['topic-buttons']}) feeds = [ (u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')] - extra_css = ''' - body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} - ''' diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe index faa1b341a0..d6db93dad7 100644 --- a/recipes/natemat_pl.recipe +++ b/recipes/natemat_pl.recipe @@ -1,3 
+1,4 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe class NaTemat(BasicNewsRecipe): @@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe): description = u'informacje, komentarze, opinie' category = 'news' language = 'pl' + preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?', re.IGNORECASE), lambda m: '')] cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' no_stylesheets = True keep_only_tags= [dict(id='main')] - remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})] + remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})] feeds = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')] diff --git a/recipes/phillosophy_now.recipe b/recipes/phillosophy_now.recipe new file mode 100644 index 0000000000..7c12832c70 --- /dev/null +++ b/recipes/phillosophy_now.recipe @@ -0,0 +1,75 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +from collections import OrderedDict + +class PhilosophyNow(BasicNewsRecipe): + + title = 'Philosophy Now' + __author__ = 'Rick Shang' + + description = '''Philosophy Now is a lively magazine for everyone + interested in ideas. It isn't afraid to tackle all the major questions of + life, the universe and everything. 
Published every two months, it tries to + corrupt innocent citizens by convincing them that philosophy can be + exciting, worthwhile and comprehensible, and also to provide some enjoyable + reading matter for those already ensnared by the muse, such as philosophy + students and academics.''' + language = 'en' + category = 'news' + encoding = 'UTF-8' + + keep_only_tags = [dict(attrs={'id':'fullMainColumn'})] + remove_tags = [dict(attrs={'class':'articleTools'})] + no_javascript = True + no_stylesheets = True + needs_subscription = True + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('https://philosophynow.org/auth/login') + br.select_form(nr = 1) + br['username'] = self.username + br['password'] = self.password + br.submit() + return br + + def parse_index(self): + #Go to the issue + soup0 = self.index_to_soup('http://philosophynow.org/') + issue = soup0.find('div',attrs={'id':'navColumn'}) + + #Find date & cover + cover = issue.find('div', attrs={'id':'cover'}) + date = self.tag_to_string(cover.find('h3')).strip() + self.timefmt = u' [%s]'%date + img=cover.find('img',src=True)['src'] + self.cover_url = 'http://philosophynow.org' + re.sub('medium','large',img) + issuenum = re.sub('/media/images/covers/medium/issue','',img) + issuenum = re.sub('.jpg','',issuenum) + + #Go to the main body + current_issue_url = 'http://philosophynow.org/issues/' + issuenum + soup = self.index_to_soup(current_issue_url) + div = soup.find ('div', attrs={'class':'articlesColumn'}) + + feeds = OrderedDict() + + for post in div.findAll('h3'): + articles = [] + a=post.find('a',href=True) + if a is not None: + url="http://philosophynow.org" + a['href'] + title=self.tag_to_string(a).strip() + s=post.findPrevious('h4') + section_title = self.tag_to_string(s).strip() + d=post.findNext('p') + desc = self.tag_to_string(d).strip() + articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) + + if articles: + if section_title not in feeds: + 
feeds[section_title] = [] + feeds[section_title] += articles + ans = [(key, val) for key, val in feeds.iteritems()] + return ans + diff --git a/recipes/psych.recipe b/recipes/psych.recipe index 3fc940b4a2..a21acefe30 100644 --- a/recipes/psych.recipe +++ b/recipes/psych.recipe @@ -1,44 +1,79 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ptempfile import PersistentTemporaryFile -from calibre.web.feeds.news import BasicNewsRecipe -class AdvancedUserRecipe1275708473(BasicNewsRecipe): - title = u'Psychology Today' - _author__ = 'rty' - publisher = u'www.psychologytoday.com' - category = u'Psychology' - max_articles_per_feed = 100 - remove_javascript = True - use_embedded_content = False - no_stylesheets = True +class PsychologyToday(BasicNewsRecipe): + + title = 'Psychology Today' + __author__ = 'Rick Shang' + + description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.' 
language = 'en' - temp_files = [] - articles_are_obfuscated = True - remove_tags = [ - dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}), - dict(name='span', attrs={'class':'print-footnote'}), - ] - remove_tags_before = dict(name='h1', attrs={'class':'print-title'}) - remove_tags_after = dict(name='div', attrs={'class':['field-items','print-footer']}) + category = 'news' + encoding = 'UTF-8' + keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})] + no_javascript = True + no_stylesheets = True - feeds = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')] - def get_article_url(self, article): - return article.get('link', None) + def parse_index(self): + articles = [] + soup = self.index_to_soup('http://www.psychologytoday.com/magazine') + + + #Go to the main body + div = soup.find('div',attrs={'id':'content-content'}) + #Find cover & date + cover_item = div.find('div', attrs={'class':'collections-header-image'}) + cover = cover_item.find('img',src=True) + self.cover_url = cover['src'] + date = self.tag_to_string(cover['title']) + self.timefmt = u' [%s]'%date + + articles = [] + for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) + title = title + u' (%s)'%author + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip() + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + 
articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + description = post.find('div', attrs={'class':'collection-node-description'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip()) + desc = self.tag_to_string(description).strip() + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + title = title + u' (%s)'%author + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) + title = title + u' (%s)'%author + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip() + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + return [('Current Issue', articles)] - def get_obfuscated_article(self, url): - br = self.get_browser() - br.open(url) - response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0) - html = response.read() - 
self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write(html) - self.temp_files[-1].close() - return self.temp_files[-1].name - def get_cover_url(self): - index = 'http://www.psychologytoday.com/magazine/' - soup = self.index_to_soup(index) - for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }): - return image['src'] + '.jpg' - return None diff --git a/recipes/sfbg.recipe b/recipes/sfbg.recipe index 0735e760c6..5c77c96f74 100644 --- a/recipes/sfbg.recipe +++ b/recipes/sfbg.recipe @@ -1,25 +1,35 @@ from calibre.web.feeds.news import BasicNewsRecipe class SanFranciscoBayGuardian(BasicNewsRecipe): - title = u'San Francisco Bay Guardian' - language = 'en' - __author__ = 'Krittika Goyal' + title = u'San Francisco Bay Guardian' + language = 'en' + __author__ = 'Krittika Goyal' oldest_article = 31 #days max_articles_per_feed = 25 + #encoding = 'latin1' no_stylesheets = True + #remove_tags_before = dict(name='div', attrs={'id':'story_header'}) + #remove_tags_after = dict(name='div', attrs={'id':'shirttail'}) remove_tags = [ - dict(name='iframe'), + dict(name='iframe'), + #dict(name='div', attrs={'class':'related-articles'}), + #dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}), + #dict(name='ul', attrs={'class':'article-tools'}), + #dict(name='ul', attrs={'id':'story_tabs'}), ] feeds = [ ('sfbg', 'http://www.sfbg.com/rss.xml'), - ('politics', 'http://www.sfbg.com/politics/rss.xml'), - ('blogs', 'http://www.sfbg.com/blog/rss.xml'), - ('pixel_vision', 'http://www.sfbg.com/pixel_vision/rss.xml'), - ('bruce', 'http://www.sfbg.com/bruce/rss.xml'), ] - + #def preprocess_html(self, soup): + #story = soup.find(name='div', attrs={'id':'story_body'}) + #td = heading.findParent(name='td') + #td.extract() + #soup = BeautifulSoup('t') + #body = soup.find(name='body') + #body.insert(0, story) + #return soup diff --git a/recipes/smashing.recipe b/recipes/smashing.recipe index 
04436a05ef..bc24166275 100644 --- a/recipes/smashing.recipe +++ b/recipes/smashing.recipe @@ -1,50 +1,24 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' -''' -www.smashingmagazine.com -''' - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from calibre.web.feeds.news import BasicNewsRecipe -class SmashingMagazine(BasicNewsRecipe): - title = 'Smashing Magazine' - __author__ = 'Darko Miletic' - description = 'We smash you with the information that will make your life easier, really' - oldest_article = 20 - language = 'en' - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - publisher = 'Smashing Magazine' - category = 'news, web, IT, css, javascript, html' - encoding = 'utf-8' +class SmashingMagazine (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0.1' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'Smashing Magazine' + description = u'Founded in September 2006, Smashing Magazine delivers useful and innovative information to Web designers and developers. Our aim is to inform our readers about the latest trends and techniques in Web development. We try to persuade you not with the quantity but with the quality of the information we present. Smashing Magazine is and always has been independent.' 
+ language = 'en' + tags = 'web development, software' + oldest_article = 7 + remove_empty_feeds = True + no_stylesheets = True + encoding = 'utf8' + cover_url = u'http://media.smashingmagazine.com/themes/smashingv4/images/logo.png' + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style'] + extra_css = u'body div table:first-child {display: none;} img {max-width: 100%; display: block; margin: auto;}' - conversion_options = { - 'comments' : description - ,'tags' : category - ,'publisher' : publisher - } - - keep_only_tags = [dict(name='div', attrs={'id':'leftcolumn'})] - remove_tags_after = dict(name='ul',attrs={'class':'social'}) - remove_tags = [ - dict(name=['link','object']) - ,dict(name='h1',attrs={'class':'logo'}) - ,dict(name='div',attrs={'id':'booklogosec'}) - ,dict(attrs={'src':'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'}) - ] - - feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')] - - def preprocess_html(self, soup): - for iter in soup.findAll('div',attrs={'class':'leftframe'}): - it = iter.find('h1') - if it == None: - iter.extract() - for item in soup.findAll('img'): - oldParent = item.parent - if oldParent.name == 'a': - oldParent.name = 'div' - return soup + feeds = [ + (u'Smashing Magazine', u'http://rss1.smashingmagazine.com/feed/'), + ] diff --git a/recipes/smith.recipe b/recipes/smith.recipe index 8bf60a227a..3d6a95c494 100644 --- a/recipes/smith.recipe +++ b/recipes/smith.recipe @@ -1,61 +1,67 @@ import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from collections import OrderedDict -class SmithsonianMagazine(BasicNewsRecipe): - title = u'Smithsonian Magazine' - language = 'en' - __author__ = 'Krittika Goyal and TerminalVeracity' - oldest_article = 31#days - 
max_articles_per_feed = 50 - use_embedded_content = False - recursions = 1 - cover_url = 'http://sphotos.xx.fbcdn.net/hphotos-snc7/431147_10150602715983253_764313347_n.jpg' - match_regexps = ['&page=[2-9]$'] - preprocess_regexps = [ - (re.compile(r'for more of Smithsonian\'s coverage on history, science and nature.', re.DOTALL), lambda m: '') - ] - extra_css = """ - h1{font-size: large; margin: .2em 0} - h2{font-size: medium; margin: .2em 0} - h3{font-size: medium; margin: .2em 0} - #byLine{margin: .2em 0} - .articleImageCaptionwide{font-style: italic} - .wp-caption-text{font-style: italic} - img{display: block} - """ +class Smithsonian(BasicNewsRecipe): + title = 'Smithsonian Magazine' + __author__ = 'Rick Shang' - remove_stylesheets = True - remove_tags_after = dict(name='div', attrs={'class':['post','articlePaginationWrapper']}) - remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':['article_sidebar_border','viewMorePhotos','addtoany_share_save_container','meta','social','OUTBRAIN','related-articles-inpage']}), - dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large','comment_section','article-related']}), - dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}), - dict(name='h4', attrs={'id':'related-topics'}), - dict(name='table'), - dict(name='a', attrs={'href':['/subArticleBottomWeb','/subArticleTopWeb','/subArticleTopMag','/subArticleBottomMag']}), - dict(name='a', attrs={'name':'comments_shaded'}), - ] + description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.' 
+ language = 'en' + category = 'news' + encoding = 'UTF-8' + keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})] + remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})] + no_javascript = True + no_stylesheets = True + def parse_index(self): + #Go to the issue + soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/') + div = soup0.find('div',attrs={'id':'archives'}) + issue = div.find('ul',attrs={'class':'clear-both'}) + current_issue_url = issue.find('a', href=True)['href'] + soup = self.index_to_soup(current_issue_url) - feeds = [ -('History and Archeology', - 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'), -('People and Places', - 'http://feeds.feedburner.com/smithsonianmag/people-places'), -('Science and Nature', - 'http://feeds.feedburner.com/smithsonianmag/science-nature'), -('Arts and Culture', - 'http://feeds.feedburner.com/smithsonianmag/arts-culture'), -('Travel', - 'http://feeds.feedburner.com/smithsonianmag/travel'), -] + #Go to the main body + div = soup.find ('div', attrs={'id':'content-inset'}) + + #Find date + date = re.sub('.*\:\W*', "", self.tag_to_string(div.find('h2')).strip()) + self.timefmt = u' [%s]'%date + + #Find cover + self.cover_url = div.find('img',src=True)['src'] + + feeds = OrderedDict() + section_title = '' + subsection_title = '' + for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}): + articles = [] + prefix = '' + h3=post.find('h3') + if h3 is not None: + section_title = self.tag_to_string(h3) + else: + subsection=post.find('p',attrs={'class':'article-cat'}) + link=post.find('a',href=True) + url=link['href']+'?c=y&story=fullstory' + if subsection is not None: + subsection_title = self.tag_to_string(subsection) + prefix = (subsection_title+': ') + description=self.tag_to_string(post('p', limit=2)[1]).strip() + else: + description=self.tag_to_string(post.find('p')).strip() 
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2012, Andreas Zeiser '
'''
szmobil.sueddeutsche.de/
'''

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
import re


class SZmobil(BasicNewsRecipe):
    """Download the paid mobile edition of the Sueddeutsche Zeitung.

    Requires an "SZ mobil" subscription; login happens in get_browser().
    """
    title = u'Süddeutsche Zeitung mobil'
    __author__ = u'Andreas Zeiser'
    description = u'Nachrichten aus Deutschland. Zugriff auf kostenpflichtiges Abo SZ mobil.'
    publisher = u'Sueddeutsche Zeitung'
    language = u'de'
    publication_type = u'newspaper'
    category = u'news, politics, Germany'

    no_stylesheets = True
    oldest_article = 2
    encoding = 'iso-8859-1'
    needs_subscription = True
    remove_empty_feeds = True
    delay = 1
    # Cover image is scraped from the publisher page, not the mobile site.
    cover_source = 'http://www.sueddeutsche.de/verlag'

    timefmt = ' [%a, %d %b, %Y]'

    root_url = 'http://szmobil.sueddeutsche.de/'
    keep_only_tags = [dict(name='div', attrs={'class':'article'})]

    # Compiled once instead of re-compiling per link in parse_index().
    # Raw string avoids the deprecated '\d' escape in a plain literal.
    _id_pattern = re.compile(r'id=(\d+)&etag=')

    def get_cover_url(self):
        """Return the URL of the current issue's cover image."""
        src = self.index_to_soup(self.cover_source)
        image_url = src.find(attrs={'class':'preview-image'})
        return image_url.div.img['src']

    def get_browser(self):
        """Log in to the subscription site and return the browser."""
        browser = BasicNewsRecipe.get_browser(self)

        # Login via fetching of Streiflicht -> Fill out login request
        url = self.root_url + 'show.php?id=streif'
        browser.open(url)

        browser.select_form(nr=0)  # the login form is the first on the page
        browser['username'] = self.username
        browser['password'] = self.password
        browser.submit()

        return browser

    def parse_index(self):
        """Build the feed list: [(section_title, [article dicts]), ...]."""
        # find all sections
        src = self.index_to_soup('http://szmobil.sueddeutsche.de')
        feeds = []
        for itt in src.findAll('a', href=True):
            if itt['href'].startswith('show.php?section'):
                # strip the trailing two characters (decoration after the name)
                feeds.append((itt.string[0:-2], itt['href']))

        all_articles = []
        for feed_title, feed_href in feeds:
            feed_url = self.root_url + feed_href

            self.report_progress(0, ('Fetching feed') + ' %s...' % (feed_title if feed_title else feed_url))

            src = self.index_to_soup(feed_url)
            articles = []
            shorttitles = dict()
            for itt in src.findAll('a', href=True):
                if not itt['href'].startswith('show.php?id='):
                    continue
                article_url = itt['href']
                match = self._id_pattern.search(article_url)
                if match is None:
                    # Fix: the original dereferenced re.search(...).group(1)
                    # unconditionally, so one unexpected link format raised
                    # AttributeError and aborted the whole download.
                    continue
                article_id = int(match.group(1))

                # first check if link is a special article in section "Meinungsseite"
                if itt.find('strong') is not None:
                    article_name = itt.strong.string
                    article_shorttitle = itt.contents[1]

                    articles.append((article_name, article_url, article_id))
                    shorttitles[article_id] = article_shorttitle
                    continue

                # candidate for a general article
                article_name = itt.string if itt.string is not None else ''

                if article_name[0:10] == " mehr":
                    # just another link ("mehr") to an article
                    continue

                # Links carrying an 'id' attribute are short-title markers,
                # everything else is a real article entry.
                if itt.get('id') is not None:
                    shorttitles[article_id] = article_name
                else:
                    articles.append((article_name, article_url, article_id))

            feed_articles = []
            for article_name, article_url, article_id in articles:
                url = self.root_url + article_url
                pubdate = strftime('%a, %d %b')
                description = shorttitles.get(article_id, '')
                # we do not want the flag ("Impressum")
                if "HERAUSGEGEBEN VOM" in description:
                    continue
                feed_articles.append(dict(title=article_name, url=url,
                                          date=pubdate,
                                          description=description,
                                          content=''))
            all_articles.append((feed_title, feed_articles))

        return all_articles
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict


class TNR(BasicNewsRecipe):
    """Download the current issue of The New Republic magazine.

    parse_index() scrapes the magazine-issues page instead of using RSS so
    the articles are grouped by the printed issue's sections.
    """

    title = 'The New Republic'
    __author__ = 'Rick Shang'

    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Return [(section_title, [article dicts]), ...] for the current issue."""
        # Go to the issue
        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = soup0.find('div', attrs={'id':'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class':'date'})).strip()
        self.timefmt = u' [%s]' % date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class':'article_detail_body'})

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        # OrderedDict keeps the sections in the order they appear in the issue.
        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            articles = []
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            if em is not None:
                # <em> marks a new top-level section; reset the subsection.
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                # <b> marks a subsection within the current section.
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                # Plain string replacement; the original used re.sub with an
                # unescaped-dot pattern for a literal substitution.
                url = a['href'].replace('www.tnr.com', 'www.tnr.com/print')
                # Bug fix: the original called
                #   re.sub('.*by\s', '', text, re.DOTALL)
                # which passes re.DOTALL (16) as the positional *count*
                # argument, not as flags, so '.' never matched newlines.
                # The inline (?s) flag applies DOTALL correctly and also
                # works on Python versions whose re.sub lacks a flags kwarg.
                author = re.sub(r'(?s).*by\s', '', self.tag_to_string(post))
                title = prefix + self.tag_to_string(a).strip() + u' (%s)' % author
                articles.append({'title':title, 'url':url, 'description':'', 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        return ans
25/7/12 language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 15 + max_articles_per_feed = 12 remove_empty_feeds = True no_stylesheets = True masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' encoding = 'UTF-8' - - remove_empty_feeds = True remove_javascript = True no_stylesheets = True + + + #preprocess_regexps = [ + # (re.compile(r'