diff --git a/COPYRIGHT b/COPYRIGHT index eb4433f96d..f6eeffc5cd 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -41,6 +41,12 @@ License: Apache 2.0 The full text of the Apache 2.0 license is available at: http://www.apache.org/licenses/LICENSE-2.0 +Files: resources/viewer/mathjax/* +Copyright: Unknown +License: Apache 2.0 + The full text of the Apache 2.0 license is available at: + http://www.apache.org/licenses/LICENSE-2.0 + Files: /src/cherrypy/* Copyright: Copyright (c) 2004-2007, CherryPy Team (team@cherrypy.org) Copyright: Copyright (C) 2005, Tiago Cogumbreiro @@ -396,8 +402,9 @@ License: other Liberation Fonts ----------------- -calibre includes a copy of the liberation fonts, available from -https://calibre-ebook.com/downloads/liberation-fonts +calibre includes a copy of the liberation fonts in TTF format, licensed under +the SIL Open Font License, Version 1.1, and available from +https://fedorahosted.org/liberation-fonts/ BSD License (for all the BSD licensed code indicated above) ----------------------------------------------------------- diff --git a/Changelog.yaml b/Changelog.yaml index 99bcfcfeb0..7ca8252a44 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,249 @@ # new recipes: # - title: +- version: 0.8.65 + date: 2012-08-17 + + new features: + - title: "A new wireless device driver. This allows connecting wirelessly to a device running a 'smart' calibre client" + description: "The wireless connection functions just as if the device was plugged into the computer by USB cable. Currently, Android devices are supported. See https://play.google.com/store/apps/details?id=com.multipie.calibreandroid" + type: major + + - title: "MOBI Output: Add an option to control the type of MOBI file produced, to the MOBI Output conversion options. You can now generate an old MOBI6, a new KF8 or a joint MOBI6/KF8 file. By default, MOBI6 files are generated. This replaces the previous use of a tweak." + + - title: "E-book viewer: Make paged mode the default. 
You can go back to the old flow mode by clicking the button with the yellow scroll in the top right corner of the viewer." + + - title: "Driver for COBY kYROS MID7042 and Samsung Galaxy Ace S5839i" + + bug fixes: + - title: "Update version of poppler bundled with calibre to fix reading covers from some PDF files" + + - title: "Get Books: Fix clicking of results from Diesel books when there is only a single result not working" + + - title: "Improve detection of system language on first run of calibre" + tickets: [1036354] + + - title: "When finding the next series index and the last series index is a fractional number, use the next largest integer, instead of just adding 1" + + - title: "Fix exception when saving a search/replace when no saved search/replace had been opened previously in the bulk search/replace dialog" + tickets: [1036464] + + - title: "Fix restore database not restoring entries for the original_* formats" + + - title: "Fix first run wizard not allowing empty email sending settings" + tickets: [1036358] + + - title: "Do not error out when setting the cover for a book that has no folders in the library." + tickets: [1035935] + + - title: "Conversion pipeline: Ignore unparseable values in the color attribute of font tags, instead of erroring out on them." + tickets: [1035633] + + - title: "Catalogs: Fix regression that broke creation of catalogs while a device is connected" + + - title: "Fix --with-library=/whatever not working for calibredb list" + + improved recipes: + - Slashdot + - Various Canadian newspapers + - Business Spectator + +- version: 0.8.64 + date: 2012-08-09 + + new features: + - title: "E-book viewer: Allow viewing images in the book in a separate pop-up window by right clicking on the image. Useful if you want to keep some image, like a map to the side while reading the book." + + - title: "Catalogs: Allow generation of catalogs in AZW3 format. Also add more powerful configuration options to exclude books and set prefixes. 
See http://www.mobileread.com/forums/showthread.php?t=187298 for details." + + - title: "Generate a PDF version of the User Manual" + + bug fixes: + - title: "News download: Fix broken handling of nesting for HTML 5 tags when parsing with BeautifulSoup" + + - title: "EPUB: Handle files in the EPUB that have semi-colons in their file names. This means in particular using URL escaping when creating the NCX as ADE cannot handle unescaped semi-colons in the NCX." + tickets: [1033665] + + - title: "Conversion pipeline: Ignore unparseable CSS instead of erroring out on it." + tickets: [1034074] + + - title: "When setting up a column coloring rule based on the languages column, allow entry of localized language names instead of only ISO codes" + + - title: "Catalogs: Generate cover for mobi/azw3 catalogs" + + - title: "Update the last modified column record of a book, whenever a format is added to the book." + + - title: "E-book viewer: Fix line scrolling stops at breaks option not working in paged mode" + tickets: [1033430] + + - title: "MOBI Output: Fix ToC at start option having no effect when converting some input documents that have an out-of-spine ToC." + tickets: [1033656] + + - title: "Catalog Generation: When generating EPUB/MOBI catalogs add more flexible rules for excluding books. Also add rules to customize the prefix characters used." + + - title: "Make setting published date using metadata search/replace more robust." + + - title: "Tag Browser: Flatten the display of sub-groups when sort by is not set to 'name'." + tickets: [1032746] + + - title: "Fix isbn:false not matching if other identifiers are attached to the book." 
+ + improved recipes: + - The New Republic + - ZDNet + - Metro UK + - FHM UK + + new recipes: + - title: eKundelek.pl + author: Artur Stachecki + + - title: Sueddeutsche Mobil + author: Andreas Zeiser + +- version: 0.8.63 + date: 2012-08-02 + + new features: + - title: "E-book viewer: Allow quick saving and loading of viewer settings as 'themes'." + tickets: [1024611] + + - title: "Ebook-viewer: Add a restore defaults button to the viewer preferences dialog" + + - title: "E-book viewer: Add simple settings for text and background colors" + + - title: "Add an entry to save to disk when right clicking a format in the book details panel" + + - title: "ODT metadata: Read first image as the metadata cover from ODT files. Also allow ODT authors to set custom properties for extended metadata." + + - title: "E-book viewer and PDF Output: Resize images that are longer than the page to fit onto a single page" + + bug fixes: + - title: "KF8 Output: Fix bug where some calibre generated KF8 files would cause the Amazon KF8 viewer on the Touch to go to into an infinite loop when using the next page function" + tickets: [1026421] + + - title: "News download: Add support for tags that link to SVG images." + tickets: [1031553] + + - title: "Update podofo to 0.9.1 in all binary builds, to fix corruption of some PDFs when updating metadata." + tickets: [1031086] + + - title: "Catalog generation: Handle authors whose last name is a number." + + - title: "KF8 Input: Handle html entities in the NCX toc entries correctly" + + - title: "Fix a calibre crash that affected some windows installs" + tickets: [1030234] + + - title: "MOBI Output: Normalize unicode strings before writing to file, to workaround lack of support for non-normal unicode in Amazon's MOBI renderer." 
+ tickets: [1029825] + + - title: "EPUB Input: Handle files that have duplicate entries in the spine" + + - title: "Fix regression in Kobo driver that caused the on device column to not be updated after deleting books" + + new recipes: + - title: Dziennik Polski + author: Gregorz Maj + + - title: High Country Blogs + author: Armin Geller + + - title: Philosophy Now + author: Rick Shang + +- version: 0.8.62 + date: 2012-07-27 + + new features: + - title: "Book details panel: Allow right clicking on a format to delete it." + + - title: "When errors occur in lots of background jobs, add an option to the error message to temporarily suppress subsequent error messages." + tickets: [886904] + + - title: "E-book viewer full screen mode: Allow clicking in the left and right page margins to turn pages." + tickets: [1024819] + + - title: "Drivers for various Android devices" + tickets: [1028690,1027431] + + - title: "Advanced search dialog: When starting on the title/author/etc. tab, restore the previously used search kind as well." + tickets: [1029745] + + - title: "When presenting the calibre must be restarted warning after installing a new plugin, add a restart now button so that the user can conveniently restart calibre. Currently only works when going vie Preferences->Plugins->Get new plugins" + + bug fixes: + - title: "Fix main window layout state being saved incorrectly if calibre is killed without a proper shutdown" + + - title: "Fix boolean and date searching in non english calibre installs." 
+ + - title: "Conversion: Ignore invalid chapter detection and level n ToC expressions instead of erroring out" + + improved recipes: + - Psychology Today + - The Smithsonian + - The New Republic + - Various updated Polish news sources + - The Sun + - San Francisco Bay Guardian + - AnandTech + - Smashing Magazine + + new recipes: + - title: Linux Journal and Conowego.pl + author: fenuks + + - title: A list apart and .net magazine + author: Marc Busque + +- version: 0.8.61 + date: 2012-07-20 + + new features: + - title: "E-book viewer: Add a paged mode that splits up the text into pages, like in a paper book instead of presenting it as a single column. To activate click the button with the yellow scroll icon in the top right corner." + type: major + description: "In paged mode, the ebook viewer no longer cuts off the last line of text at the bottom of the screen, and it respects CSS page-break directives. You can also set page margins and control the number of pages displayed on screen by clicking the Preferences button in the viewer and going to 'Text layout in paged mode'." + + - title: "Digitally sign the calibre OS X and windows builds" + + - title: "Get Books: Add Mills and Boon UK" + + - title: "Various minor improvements to the Bulk metadata edit dialog" + tickets: [1025825, 1025838, 1025628] + + - title: "Fix various regression in the auto-complete functionality for authors/series/tags etc introduced in 0.8.60" + + - title: "Drivers for various new Android devices" + tickets: [1024934] + + - title: "MOBI: Add support for the new language EXTH header field in MOBI files generated by kindlegen 2.5" + + bug fixes: + - title: "KF8 Output: Fix calibre produced KF8 files not showing the 'Use publisher font' option on the Kindle Touch when they have embedded fonts" + + - title: "Txt/fb2/rtf/pml/rb output: Fix non-visibile element's tail text (which should be visible) is being ignored when it shouldn't." 
+ tickets: [1026541] + + - title: "Book details panel: When displaying a link to amazon, use a country specific name like amazon.fr instead of using amazon.com for all countries" + + - title: "Conversion: When splitting on page breaks, ignore page-breaks with values of auto and inherit. " + tickets: [1018875] + + - title: "Metadata jacket: Specify foreground in addition to the background color for the title banner so that it remain readable if the user tries to monkey with the CSS in the viewer." + + - title: "PDF Output: Fix rendering of cover as first age of PDF (ignore margins so that the image covers the entire page)" + + - title: "Linux binaries: Bundle libglib to avoid incompatibilities with glib on various distros." + tickets: [1022019] + + - title: "Fix find_identical_books() choking on books with too many authors" + + + improved recipes: + - Toronto Star + - American Prospect + - faz.net + - version: 0.8.60 date: 2012-07-13 diff --git a/manual/Makefile b/manual/Makefile index c1a2279abf..a21de12bed 100644 --- a/manual/Makefile +++ b/manual/Makefile @@ -60,7 +60,7 @@ htmlhelp: latex: mkdir -p .build/latex .build/doctrees - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) .build/latex + $(SPHINXBUILD) -b mylatex $(ALLSPHINXOPTS) .build/latex @echo @echo "Build finished; the LaTeX files are in .build/latex." @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ diff --git a/manual/conf.py b/manual/conf.py index 7b24f2f50a..967b6f0c65 100644 --- a/manual/conf.py +++ b/manual/conf.py @@ -14,10 +14,10 @@ import sys, os # If your extensions are in another directory, add it here. 
-sys.path.append(os.path.abspath('../src')) sys.path.append(os.path.abspath('.')) -__appname__ = os.environ.get('__appname__', 'calibre') -__version__ = os.environ.get('__version__', '0.0.0') +import init_calibre +init_calibre +from calibre.constants import __appname__, __version__ import custom custom # General configuration @@ -154,7 +154,8 @@ latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). -#latex_documents = [] +latex_documents = [('index', 'calibre.tex', 'calibre User Manual', + 'Kovid Goyal', 'manual', False)] # Additional stuff for the LaTeX preamble. #latex_preamble = '' @@ -164,3 +165,11 @@ latex_font_size = '10pt' # If false, no module index is generated. #latex_use_modindex = True + +latex_logo = 'resources/logo.png' +latex_show_pagerefs = True +latex_show_urls = 'footnote' +latex_elements = { +'papersize':'letterpaper', +'fontenc':r'\usepackage[T2A,T1]{fontenc}' +} diff --git a/manual/conversion.rst b/manual/conversion.rst index 5eaca5a469..feae2a4273 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -710,3 +710,35 @@ EPUB from the ZIP file are:: Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer. + +Convert ODT documents +~~~~~~~~~~~~~~~~~~~~~ + +|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting. +When inserting images into your document you need to anchor them to the paragraph, images anchored to a page will all end up in the front of the conversion. + +To enable automatic detection of chapters, you need to mark them with the build-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag
``<h1>``, 'Heading 2' to ``<h2>``
etc). When you convert in |app| you can enter which style you used into the 'Detect chapters at' box. Example: + + * If you mark Chapters with style 'Heading 2', you have to set the 'Detect chapters at' box to ``//h:h2`` + * For a nested TOC with Sections marked with 'Heading 2' and the Chapters marked with 'Heading 3' you need to enter ``//h:h2|//h:h3``. On the Convert - TOC page set the 'Level 1 TOC' box to ``//h:h2`` and the 'Level 2 TOC' box to ``//h:h3``. + +Well-known document properties (Title, Keywords, Description, Creator) are recognized and |app| will use the first image (not to small, and with good aspect-ratio) as the cover image. + +There is also an advanced property conversion mode, which is activated by setting the custom property ``opf.metadata`` ('Yes or No' type) to Yes in your ODT document (File->Properties->Custom Properties). +If this property is detected by |app|, the following custom properties are recognized (``opf.authors`` overrides document creator):: + + opf.titlesort + opf.authors + opf.authorsort + opf.publisher + opf.pubdate + opf.isbn + opf.language + opf.series + opf.seriesindex + +In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used. +As the cover detection might result in double covers in certain output formats, the process will remove the paragraph (only if the only content is the cover!) from the document. But this works only with the named picture! + +To disable cover detection you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes in advanced mode. 
+ diff --git a/manual/custom.py b/manual/custom.py index fdfb5711bb..30ca28ec96 100644 --- a/manual/custom.py +++ b/manual/custom.py @@ -14,6 +14,7 @@ from sphinx.util.console import bold sys.path.append(os.path.abspath('../../../')) from calibre.linux import entry_points from epub import EPUBHelpBuilder +from latex import LaTeXHelpBuilder def substitute(app, doctree): pass @@ -251,6 +252,7 @@ def template_docs(app): def setup(app): app.add_config_value('kovid_epub_cover', None, False) app.add_builder(EPUBHelpBuilder) + app.add_builder(LaTeXHelpBuilder) app.connect('doctree-read', substitute) app.connect('builder-inited', generate_docs) app.connect('build-finished', finished) diff --git a/manual/develop.rst b/manual/develop.rst old mode 100755 new mode 100644 index 12bbcefe57..d59c315951 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -6,9 +6,9 @@ Setting up a |app| development environment =========================================== |app| is completely open source, licensed under the `GNU GPL v3 `_. -This means that you are free to download and modify the program to your heart's content. In this section, -you will learn how to get a |app| development environment set up on the operating system of your choice. -|app| is written primarily in `Python `_ with some C/C++ code for speed and system interfacing. +This means that you are free to download and modify the program to your heart's content. In this section, +you will learn how to get a |app| development environment set up on the operating system of your choice. +|app| is written primarily in `Python `_ with some C/C++ code for speed and system interfacing. Note that |app| is not compatible with Python 3 and requires at least Python 2.7. .. contents:: Contents @@ -20,14 +20,14 @@ Design philosophy |app| has its roots in the Unix world, which means that its design is highly modular. The modules interact with each other via well defined interfaces. 
This makes adding new features and fixing -bugs in |app| very easy, resulting in a frenetic pace of development. Because of its roots, |app| has a +bugs in |app| very easy, resulting in a frenetic pace of development. Because of its roots, |app| has a comprehensive command line interface for all its functions, documented in :ref:`cli`. The modular design of |app| is expressed via ``Plugins``. There is a :ref:`tutorial ` on writing |app| plugins. For example, adding support for a new device to |app| typically involves writing less than a 100 lines of code in the form of -a device driver plugin. You can browse the -`built-in drivers `_. Similarly, adding support -for new conversion formats involves writing input/output format plugins. Another example of the modular design is the :ref:`recipe system ` for +a device driver plugin. You can browse the +`built-in drivers `_. Similarly, adding support +for new conversion formats involves writing input/output format plugins. Another example of the modular design is the :ref:`recipe system ` for fetching news. For more examples of plugins designed to add features to |app|, see the `plugin index `_. Code layout @@ -91,15 +91,15 @@ this, make your changes, then run:: This will create a :file:`my-changes` file in the current directory, simply attach that to a ticket on the |app| `bug tracker `_. -If you plan to do a lot of development on |app|, then the best method is to create a +If you plan to do a lot of development on |app|, then the best method is to create a `Launchpad `_ account. Once you have an account, you can use it to register your bzr branch created by the `bzr branch` command above. First run the following command to tell bzr about your launchpad account:: bzr launchpad-login your_launchpad_username -Now, you have to setup SSH access to Launchpad. First create an SSH public/private keypair. Then upload -the public key to Launchpad by going to your Launchpad account page. 
Instructions for setting up the +Now, you have to setup SSH access to Launchpad. First create an SSH public/private keypair. Then upload +the public key to Launchpad by going to your Launchpad account page. Instructions for setting up the private key in bzr are at http://bazaar-vcs.org/Bzr_and_SSH. Now you can upload your branch to the |app| project in Launchpad by following the instructions at https://help.launchpad.net/Code/UploadingABranch. Whenever you commit changes to your branch with the command:: @@ -108,7 +108,7 @@ Whenever you commit changes to your branch with the command:: Kovid can merge it directly from your branch into the main |app| source tree. You should also keep an eye on the |app| `development forum `. Before making major changes, you should -discuss them in the forum or contact Kovid directly (his email address is all over the source code). +discuss them in the forum or contact Kovid directly (his email address is all over the source code). Windows development environment --------------------------------- @@ -118,12 +118,12 @@ the previously checked out |app| code directory. For example:: cd C:\Users\kovid\work\calibre -calibre is the directory that contains the src and resources sub-directories. +calibre is the directory that contains the src and resources sub-directories. The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. So, following the example above, it would be ``C:\Users\kovid\work\calibre\src``. `Here is a short guide `_ to setting environment -variables on Windows. +variables on Windows. Once you have set the environment variable, open a new command prompt and check that it was correctly set by using the command:: @@ -134,7 +134,7 @@ Setting this environment variable means that |app| will now load all its Python That's it! You are now ready to start hacking on the |app| code. 
For example, open the file :file:`src\\calibre\\__init__.py` in your favorite editor and add the line:: - + print ("Hello, world!") near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``. @@ -149,24 +149,25 @@ the previously checked out |app| code directory, for example:: calibre is the directory that contains the src and resources sub-directories. Ensure you have installed the |app| commandline tools via :guilabel:`Preferences->Advanced->Miscellaneous` in the |app| GUI. -The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. -So, following the example above, it would be ``/Users/kovid/work/calibre/src``. Apple -`documentation `_ -on how to set environment variables. +The next step is to create a bash script that will set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory when running calibre in debug mode. -Once you have set the environment variable, open a new Terminal and check that it was correctly set by using -the command:: +Create a plain text file:: - echo $CALIBRE_DEVELOP_FROM + #!/bin/sh + export CALIBRE_DEVELOP_FROM="/Users/kovid/work/calibre/src" + calibre-debug -g -Setting this environment variable means that |app| will now load all its Python code from the specified location. +Save this file as ``/usr/bin/calibre-develop``, then set its permissions so that it can be executed:: -That's it! You are now ready to start hacking on the |app| code. For example, open the file :file:`src/calibre/__init__.py` -in your favorite editor and add the line:: - - print ("Hello, world!") + chmod +x /usr/bin/calibre-develop -near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``. 
+Once you have done this, run:: + + calibre-develop + +You should see some diagnostic information in the Terminal window as calibre +starts up, and you should see an asterisk after the version number in the GUI +window, indicating that you are running from source. Linux development environment ------------------------------ @@ -181,11 +182,11 @@ Install the |app| using the binary installer. Then open a terminal and change to cd /home/kovid/work/calibre -calibre is the directory that contains the src and resources sub-directories. +calibre is the directory that contains the src and resources sub-directories. The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. So, following the example above, it would be ``/home/kovid/work/calibre/src``. How to set environment variables depends on -your Linux distribution and what shell you are using. +your Linux distribution and what shell you are using. Once you have set the environment variable, open a new terminal and check that it was correctly set by using the command:: @@ -196,7 +197,7 @@ Setting this environment variable means that |app| will now load all its Python That's it! You are now ready to start hacking on the |app| code. For example, open the file :file:`src/calibre/__init__.py` in your favorite editor and add the line:: - + print ("Hello, world!") near the top of the file. Now run the command :command:`calibredb`. The very first line of output should be ``Hello, world!``. diff --git a/manual/gui.rst b/manual/gui.rst index d82db2772a..c57b49185d 100755 --- a/manual/gui.rst +++ b/manual/gui.rst @@ -548,7 +548,7 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. 
Thes - Toggle jobs list * - :kbd:`Alt+Shift+B` - Toggle Cover Browser - * - :kbd:`Alt+Shift+B` + * - :kbd:`Alt+Shift+D` - Toggle Book Details panel * - :kbd:`Alt+Shift+T` - Toggle Tag Browser diff --git a/manual/images/lorentz.png b/manual/images/lorentz.png new file mode 100644 index 0000000000..4b261dde38 Binary files /dev/null and b/manual/images/lorentz.png differ diff --git a/manual/index.rst b/manual/index.rst index fa89dba95f..b8f98a5561 100755 --- a/manual/index.rst +++ b/manual/index.rst @@ -17,7 +17,7 @@ To get started with more advanced usage, you should read about the :ref:`Graphic .. only:: online - **An ebook version of this user manual is available in** `EPUB format `_ and `AZW3 (Kindle Fire) format `_. + **An ebook version of this user manual is available in** `EPUB format `_, `AZW3 (Kindle Fire) format `_ and `PDF format `_. Sections ------------ diff --git a/manual/latex.py b/manual/latex.py new file mode 100644 index 0000000000..95f38eab20 --- /dev/null +++ b/manual/latex.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + + +from sphinx.builders.latex import LaTeXBuilder + +class LaTeXHelpBuilder(LaTeXBuilder): + name = 'mylatex' + + def finish(self): + LaTeXBuilder.finish(self) + self.info('Fixing Cyrillic characters...') + tex = os.path.join(self.outdir, 'calibre.tex') + with open(tex, 'r+b') as f: + raw = f.read().replace(b'Михаил Горбачёв', + br'{\fontencoding{T2A}\selectfont Михаил Горбачёв}') + f.seek(0) + f.write(raw) diff --git a/manual/mathjax.html b/manual/mathjax.html new file mode 100644 index 0000000000..0cf8c1f8b1 --- /dev/null +++ b/manual/mathjax.html @@ -0,0 +1,102 @@ + + + + +Math Test Page + + + + + + + + + +

<h1>Sample Equations</h1>

+ +

<h2>The Lorenz Equations</h2>

+ +

+\begin{align} +\dot{x} & = \sigma(y-x) \label{lorenz}\\ +\dot{y} & = \rho x - y - xz \\ +\dot{z} & = -\beta z + xy +\end{align} +

+ +

<h2>The Cauchy-Schwarz Inequality</h2>

+ +

\[ +\left( \sum_{k=1}^n a_k b_k \right)^{\!\!2} \leq + \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) +\]

+ +

<h2>A Cross Product Formula</h2>

+ +

\[ + \mathbf{V}_1 \times \mathbf{V}_2 = + \begin{vmatrix} + \mathbf{i} & \mathbf{j} & \mathbf{k} \\ + \frac{\partial X}{\partial u} & \frac{\partial Y}{\partial u} & 0 \\ + \frac{\partial X}{\partial v} & \frac{\partial Y}{\partial v} & 0 \\ + \end{vmatrix} +\]

+ +

<p>The probability of getting \(k\) heads when flipping \(n\) coins is:</p>

+ +

<p>\[P(E) = {n \choose k} p^k (1-p)^{ n-k} \]</p>

+ +

<h2>An Identity of Ramanujan</h2>

+ +

\[ + \frac{1}{(\sqrt{\phi \sqrt{5}}-\phi) e^{\frac25 \pi}} = + 1+\frac{e^{-2\pi}} {1+\frac{e^{-4\pi}} {1+\frac{e^{-6\pi}} + {1+\frac{e^{-8\pi}} {1+\ldots} } } } +\]

+ +

<h2>A Rogers-Ramanujan Identity</h2>

+ +

\[ + 1 + \frac{q^2}{(1-q)}+\frac{q^6}{(1-q)(1-q^2)}+\cdots = + \prod_{j=0}^{\infty}\frac{1}{(1-q^{5j+2})(1-q^{5j+3})}, + \quad\quad \text{for $|q|<1$}. +\]

+ +

<h2>Maxwell's Equations</h2>

+ +

+\begin{align} + \nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\ + \nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\ + \nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} & = \vec{\mathbf{0}} \\ + \nabla \cdot \vec{\mathbf{B}} & = 0 +\end{align} +

+ +

<h1>In-line Mathematics</h1>

+ +

While display equations look good for a page of samples, the +ability to mix math and text in a paragraph is also important. This +expression \(\sqrt{3x-1}+(1+x)^2\) is an example of an inline equation. As +you see, equations can be used this way as well, without unduly +disturbing the spacing between lines.

+ +

<h2>References to equations</h2>

+ +

<p>Here is a reference to the Lorenz Equations (\ref{lorenz}). Clicking on the equation number will take you back to the equation.</p>

+ + + diff --git a/manual/news.rst b/manual/news.rst index 873025d467..9783a262aa 100755 --- a/manual/news.rst +++ b/manual/news.rst @@ -30,7 +30,7 @@ Lets pick a couple of feeds that look interesting: #. Business Travel: http://feeds.portfolio.com/portfolio/businesstravel #. Tech Observer: http://feeds.portfolio.com/portfolio/thetechobserver -I got the URLs by clicking the little orange RSS icon next to each feed name. To make |app| download the feeds and convert them into an ebook, you should click the :guilabel:`Fetch news` button and then the :guilabel:`Add a custom news source` menu item. A dialog similar to that shown below should open up. +I got the URLs by clicking the little orange RSS icon next to each feed name. To make |app| download the feeds and convert them into an ebook, you should right click the :guilabel:`Fetch news` button and then the :guilabel:`Add a custom news source` menu item. A dialog similar to that shown below should open up. .. image:: images/custom_news.png :align: center diff --git a/manual/resources/mathjax.epub b/manual/resources/mathjax.epub new file mode 100644 index 0000000000..09b1e45b01 Binary files /dev/null and b/manual/resources/mathjax.epub differ diff --git a/manual/tutorials.rst b/manual/tutorials.rst index dd41c730b0..420c1a4cbd 100755 --- a/manual/tutorials.rst +++ b/manual/tutorials.rst @@ -18,4 +18,5 @@ Here you will find tutorials to get you started using |app|'s more advanced feat regexp server creating_plugins + typesetting_math diff --git a/manual/typesetting_math.rst b/manual/typesetting_math.rst new file mode 100644 index 0000000000..dd3d574c3a --- /dev/null +++ b/manual/typesetting_math.rst @@ -0,0 +1,70 @@ + +.. include:: global.rst + +.. _typesetting_math: + + +Typesetting Math in ebooks +============================ + +The |app| ebook viewer has the ability to display math embedded in ebooks (ePub +and HTML files). You can typeset the math directly with TeX or MathML or +AsciiMath. 
The |app| viewer uses the excellent `MathJax +`_ library to do this. This is a brief tutorial on +creating ebooks with math in them that work well with the |app| viewer. + +.. note:: + This only applies to calibre version 0.8.66 and newer + +A simple HTML file with mathematics +------------------------------------- + +You can write mathematics inline inside a simple HTML file and the |app| viewer +will render it into properly typeset mathematics. In the example below, we use +TeX notation for mathematics. You will see that you can use normal TeX +commands, with the small caveat that ampersands and less than and greater than +signs have to be written as & < and > respectively. + +The first step is to tell |app| that this will contains maths. You do this by +adding the following snippet of code to the section of the HTML file:: + + + +That's it, now you can type mathematics just as you would in a .tex file. For +example, here are Lorentz's equations:: + +

The Lorenz Equations

+ +

+ \begin{align} + \dot{x} & = \sigma(y-x) \\ + \dot{y} & = \rho x - y - xz \\ + \dot{z} & = -\beta z + xy + \end{align} +

+ +This snippet looks like the following screen shot in the |app| viewer. + +.. figure:: images/lorentz.png + :align: center + + :guilabel:`The Lorentz Equations` + +The complete HTML file, with more equations and inline mathematics is +reproduced below. You can convert this HTML file to EPUB in |app| to end up +with an ebook you can distribute easily to other people. + +.. only:: online + + Here is the generated EPUB file: `mathjax.epub <_static/mathjax.epub>`_. + +.. literalinclude:: mathjax.html + :language: html + +More information +----------------- + +Since the |app| viewer uses the MathJax library to render mathematics, the best +place to find out more about math in ebooks and get help is the `MathJax +website `_. + diff --git a/recipes/anandtech.recipe b/recipes/anandtech.recipe index aa10084070..ff08c828ac 100644 --- a/recipes/anandtech.recipe +++ b/recipes/anandtech.recipe @@ -21,8 +21,12 @@ class anan(BasicNewsRecipe): remove_javascript = True encoding = 'utf-8' - remove_tags=[dict(name='a', attrs={'style':'width:110px; margin-top:0px;text-align:center;'}), - dict(name='a', attrs={'style':'width:110px; margin-top:0px; margin-right:20px;text-align:center;'})] + remove_tags=[ + dict(name='a', attrs={'style':'width:110px; margin-top:0px;text-align:center;'}), + dict(name='a', attrs={'style':'width:110px; margin-top:0px; margin-right:20px;text-align:center;'}), + {'attrs':{'class':['article_links', 'header', 'body_right']}}, + {'id':['crumbs']}, + ] feeds = [ ('Anandtech', 'http://www.anandtech.com/rss/')] diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 00eea1be68..9544abdfcf 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe import re -class Benchmark_pl(BasicNewsRecipe): +class BenchmarkPl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' description = u'benchmark.pl -IT site' @@ -14,7 +14,7 @@ class 
Benchmark_pl(BasicNewsRecipe): preprocess_regexps = [(re.compile(ur'

 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})] remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] + remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] INDEX= 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] diff --git a/recipes/brecha.recipe b/recipes/brecha.recipe new file mode 100644 index 0000000000..da58710dd5 --- /dev/null +++ b/recipes/brecha.recipe @@ -0,0 +1,82 @@ +from __future__ import (unicode_literals, division, absolute_import, + print_function) +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +www.brecha.com.uy +''' + +import urllib +from calibre.web.feeds.news import BasicNewsRecipe + +class Brecha(BasicNewsRecipe): + title = 'Brecha Digital' + __author__ = 'Darko Miletic' + description = 'Brecha , Cultura ,Sociales , Separatas, Lupas, Vueltas de Montevideo y toda la infomacion que caracteriza a este semanario' + publisher = 'Brecha' + category = 'brecha, digital, prensa, uruguay, semanario, 
sociedad, politica, cultura' + oldest_article = 7 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'es_UY' + remove_empty_feeds = True + publication_type = 'magazine' + auto_cleanup = True + needs_subscription = 'optional' + masthead_url = 'http://www.brecha.com.uy/templates/ja_nex/themes/orange/images/logo.png' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('http://www.brecha.com.uy/index.php/acceder-miembros') + if self.username is not None and self.password is not None: + data = urllib.urlencode({ 'task':'login' + ,'view':'register' + ,'username':self.username + ,'password':self.password + }) + br.open('http://www.brecha.com.uy/index.php/index.php?option=com_osemsc&controller=register',data) + return br + + remove_tags = [ + dict(name=['meta','link']), + dict(name='div', attrs={'id':'js_ja'}), + dict(name='ul', attrs={'class':'actions'}) + ] + remove_attributes=['lang', 'border'] + + feeds = [ + (u'Politica' , u'http://www.brecha.com.uy/index.php/politica-uruguaya?format=feed&type=rss'), + (u'Mundo' , u'http://www.brecha.com.uy/index.php/mundo?format=feed&type=rss'), + (u'Mapamundi', u'http://www.brecha.com.uy/index.php/mundo/mapamundi?format=feed&type=rss'), + (u'Cultura' , u'http://www.brecha.com.uy/index.php/cultura?format=feed&type=rss'), + (u'Vueltas de Montevideo', u'http://www.brecha.com.uy/index.php/cultura/vueltas-de-montevideo?format=feed&type=rss'), + (u'Secos y Mojados', u'http://www.brecha.com.uy/index.php/cultura/secos-y-mojados?format=feed&type=rss'), + (u'Literarias', u'http://www.brecha.com.uy/index.php/cultura/literarias?format=feed&type=rss'), + (u'Sociedad', 
u'http://www.brecha.com.uy/index.php/sociedad?format=feed&type=rss'), + (u'Especiales', u'http://www.brecha.com.uy/index.php/especiales?format=feed&type=rss'), + (u'Contratapa', u'http://www.brecha.com.uy/index.php/contratapa?format=feed&type=rss') + ] + + def print_version(self, url): + return url + '?tmpl=component&print=1&layout=default&page=' + + def get_cover_url(self): + soup = self.index_to_soup('http://www.brecha.com.uy/index.php') + for image in soup.findAll('img', alt=True): + if image['alt'].startswith('Tapa '): + return 'http://www.brecha.com.uy' + urllib.quote(image['src']) + return None diff --git a/recipes/business_spectator.recipe b/recipes/business_spectator.recipe index ef58424c6c..9ed3f1f7ac 100644 --- a/recipes/business_spectator.recipe +++ b/recipes/business_spectator.recipe @@ -16,6 +16,7 @@ class BusinessSpectator(BasicNewsRecipe): oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True + auto_cleanup = True #delay = 1 use_embedded_content = False encoding = 'utf8' @@ -32,11 +33,11 @@ class BusinessSpectator(BasicNewsRecipe): ,'linearize_tables': False } - keep_only_tags = [dict(id='storyHeader'), dict(id='body-html')] + #keep_only_tags = [dict(id='storyHeader'), dict(id='body-html')] - remove_tags = [dict(attrs={'class':'hql'})] + #remove_tags = [dict(attrs={'class':'hql'})] - remove_attributes = ['width','height','style'] + #remove_attributes = ['width','height','style'] feeds = [ ('Top Stories', 'http://www.businessspectator.com.au/top-stories.rss'), @@ -46,3 +47,4 @@ class BusinessSpectator(BasicNewsRecipe): ('Daily Dossier', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=kgb&cat=dossier'), ('Australia', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=region&cat=australia'), ] + diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index 12134bc9a4..d1b28de9de 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -1,35 +1,314 @@ -from 
calibre.web.feeds.news import BasicNewsRecipe - -class CalgaryHerald(BasicNewsRecipe): - title = u'Calgary Herald' - oldest_article = 3 - max_articles_per_feed = 100 - - feeds = [ - (u'News', u'http://rss.canada.com/get/?F233'), - (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'), - (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'), - (u'Politics', u'http://rss.canada.com/get/?F7551'), - (u'National', u'http://rss.canada.com/get/?F7552'), - (u'World', u'http://rss.canada.com/get/?F7553'), - ] - __author__ = 'rty' - pubisher = 'Calgary Herald' - description = 'Calgary, Alberta, Canada' - category = 'News, Calgary, Alberta, Canada' - - - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'en_CA' - encoding = 'utf-8' - conversion_options = {'linearize_tables':True} - ##masthead_url = 'http://www.calgaryherald.com/index.html' - keep_only_tags = [ - dict(name='div', attrs={'id':'storyheader'}), - dict(name='div', attrs={'id':'storycontent'}) - - ] - remove_tags_after = {'class':"story_tool_hr"} - +#!/usr/bin/env python +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' + +''' +www.canada.com +''' +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup + + +class CanWestPaper(BasicNewsRecipe): + + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), 
+ (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] + + + # un-comment the following six lines for the Vancouver Province +## title = u'Vancouver Province' +## url_prefix = 'http://www.theprovince.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' + + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' + + # un-comment the following six lines for the Calgary Herald + title = u'Calgary Herald' + url_prefix = 'http://www.calgaryherald.com' + description = u'News from Calgary, AB' + std_logo_url = 
'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' + logo_url = 'chlogo.jpg' + fp_tag = 'CAN_CH' + + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' + + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' + + # un-comment the following six lines for the Montreal Gazette +## title = u'Montreal Gazette' +## url_prefix = 'http://www.montrealgazette.com' +## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' +## fp_tag = 'CAN_MG' + + Kindle_Fire=False + masthead_url = std_logo_url + + url_list = [] + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + encoding = 'utf-8' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: small; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', 
attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + + def get_cover_url(self): + from datetime import timedelta, date + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + daysback=1 + try: + br.open(cover) + except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: + self.log("\nCover unavailable") + cover = None + return cover + + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + fixed = re.sub("’","’",fixed) + return fixed + + def 
massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + + def populate_article_metadata(self, article, soup, first): + if first: + picdiv = soup.find('body').find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) + xtitle = article.text_summary.strip() + if len(xtitle) == 0: + desc = soup.find('meta',attrs={'property':'og:description'}) + if desc is not None: + article.summary = article.text_summary = desc['content'] + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup + + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() + 
return self.strip_anchors(soup) + + + + def parse_index(self): + + articles = {} + ans = [] + + + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans + diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe new file mode 100755 index 
0000000000..8b4288ddcd --- /dev/null +++ b/recipes/conowego_pl.recipe @@ -0,0 +1,38 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class CoNowegoPl(BasicNewsRecipe): + title = u'conowego.pl' + __author__ = 'fenuks' + description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !' + cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png' + category = 'IT, news' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(name='div', attrs={'class':'news_list single_view'})] + remove_tags = [dict(name='div', attrs={'class':['ni_bottom', 'ni_rank', 'ni_date']})] + feeds = [(u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'), (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'), (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'), (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100')] + + def preprocess_html(self, soup): + for i in soup.findAll('img'): + i.parent.insert(0, BeautifulSoup('
')) + i.insert(len(i), BeautifulSoup('
')) + self.append_page(soup, soup.body) + return soup + + + def append_page(self, soup, appendtag): + tag = appendtag.find('div', attrs={'class':'pages'}) + if tag: + nexturls=tag.findAll('a') + for nexturl in nexturls[:-1]: + soup2 = self.index_to_soup('http://www.conowego.pl/' + nexturl['href']) + pagetext = soup2.find(attrs={'class':'ni_content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}): + r.extract() diff --git a/recipes/dot_net.recipe b/recipes/dot_net.recipe new file mode 100644 index 0000000000..50db71e9be --- /dev/null +++ b/recipes/dot_net.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class NetMagazineRecipe (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'.net magazine' + description = u'net is the world’s best-selling magazine for web designers and developers, featuring tutorials from leading agencies, interviews with the web’s biggest names, and agenda-setting features on the hottest issues affecting the internet today.' 
+ language = 'en' + tags = 'web development, software' + oldest_article = 7 + remove_empty_feeds = True + no_stylesheets = True + cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png' + keep_only_tags = [ + dict(name='article', attrs={'class': re.compile('^node.*$', re.IGNORECASE)}) + ] + remove_tags = [ + dict(name='span', attrs={'class': 'comment-count'}), + dict(name='div', attrs={'class': 'item-list share-links'}), + dict(name='footer'), + ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style'] + extra_css = 'img {max-width: 100%; display: block; margin: auto;} .captioned-image div {text-align: center; font-style: italic;}' + + feeds = [ + (u'.net', u'http://feeds.feedburner.com/net/topstories'), + ] diff --git a/recipes/dziennik_polski.recipe b/recipes/dziennik_polski.recipe new file mode 100644 index 0000000000..83b9d06ecd --- /dev/null +++ b/recipes/dziennik_polski.recipe @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +__license__='GPL v3' +__author__='grzegorz.maj@dziennik.krakow.pl>' + +''' +http://dziennikpolski24.pl +Author: grzegorz.maj@dziennik.krakow.pl +''' +from calibre.web.feeds.news import BasicNewsRecipe + +class DziennikPolski24(BasicNewsRecipe): + + title=u'Dziennik Polski' + publisher=u'Grupa Polskapresse' + + __author__='grzegorz.maj' + description=u'Wiadomości z wydania Dziennika Polskiego' + oldest_article=1 + max_articles_per_feed=50 + needs_subscription=True + + remove_javascript=True + no_stylesheets=True + use_embedded_content=False + remove_empty_feeds=True + extra_css='.date{margin-top: 4em;} .logo_author{margin-left:0.5em;}' + + publication_type='newspaper' + cover_url='http://www.dziennikpolski24.pl/_p/images/logoDP24-b.gif' + INDEX='http://dziennikpolski24.pl/' + + encoding='utf-8' + language='pl' + + keep_only_tags=[ + + dict(name = 'div', attrs = {'class':['toolbar']}) + , dict(name = 'h1') + , dict(name = 
'h2', attrs = {'class':['teaser']}) + , dict(name = 'div', attrs = {'class':['picture']}) + , dict(name = 'div', attrs = {'id':['showContent']}) + , dict(name = 'div', attrs = {'class':['paging']}) + , dict(name = 'div', attrs = {'class':['wykupTresc']}) + ] + + remove_tags=[ + + ] + + feeds=[ + (u'Kraj', u'http://www.dziennikpolski24.pl/rss/feed/1151') + , (u'Świat', u'http://www.dziennikpolski24.pl/rss/feed/1153') + , (u'Gospodarka', u'http://www.dziennikpolski24.pl/rss/feed/1154') + , (u'Małopolska', u'http://www.dziennikpolski24.pl/rss/feed/1155') + , (u'Kultura', u'http://www.dziennikpolski24.pl/rss/feed/1156') + , (u'Opinie', u'http://www.dziennikpolski24.pl/rss/feed/1158') + , (u'Kronika Nowohucka', u'http://www.dziennikpolski24.pl/rss/feed/1656') + , (u'Na bieżąco', u'http://www.dziennikpolski24.pl/rss/feed/1543') + , (u'Londyn 2012', u'http://www.dziennikpolski24.pl/rss/feed/2545') + , (u'Piłka nożna', u'http://www.dziennikpolski24.pl/rss/feed/2196') + , (u'Siatkówka', u'http://www.dziennikpolski24.pl/rss/feed/2197') + , (u'Koszykówka', u'http://www.dziennikpolski24.pl/rss/feed/2198') + , (u'Tenis', u'http://www.dziennikpolski24.pl/rss/feed/2199') + , (u'Formuła 1', u'http://www.dziennikpolski24.pl/rss/feed/2203') + , (u'Lekkoatletyka', u'http://www.dziennikpolski24.pl/rss/feed/2204') + , (u'Żużel', u'http://www.dziennikpolski24.pl/rss/feed/2200') + , (u'Sporty motorowe', u'http://www.dziennikpolski24.pl/rss/feed/2206') + , (u'Publicystyka sportowa', u'http://www.dziennikpolski24.pl/rss/feed/2201') + , (u'Kolarstwo', u'http://www.dziennikpolski24.pl/rss/feed/2205') + , (u'Inne', u'http://www.dziennikpolski24.pl/rss/feed/2202') + , (u'Miasto Kraków', u'http://www.dziennikpolski24.pl/rss/feed/1784') + , (u'Region nowosądecki', u'http://www.dziennikpolski24.pl/rss/feed/1795') + , (u'Region Małopolski Zachodniej', u'http://www.dziennikpolski24.pl/rss/feed/1793') + , (u'Region tarnowski', u'http://www.dziennikpolski24.pl/rss/feed/1797') + , (u'Region 
podhalański', u'http://www.dziennikpolski24.pl/rss/feed/1789') + , (u'Region olkuski', u'http://www.dziennikpolski24.pl/rss/feed/1670') + , (u'Region miechowski', u'http://www.dziennikpolski24.pl/rss/feed/1806') + , (u'Region podkrakowski', u'http://www.dziennikpolski24.pl/rss/feed/1787') + , (u'Region proszowicki', u'http://www.dziennikpolski24.pl/rss/feed/1804') + , (u'Region wielicki', u'http://www.dziennikpolski24.pl/rss/feed/1802') + , (u'Region podbeskidzki', u'http://www.dziennikpolski24.pl/rss/feed/1791') + , (u'Region myślenicki', u'http://www.dziennikpolski24.pl/rss/feed/1800') + , (u'Autosalon', u'http://www.dziennikpolski24.pl/rss/feed/1294') + , (u'Kariera', u'http://www.dziennikpolski24.pl/rss/feed/1289') + , (u'Przegląd nieruchomości', u'http://www.dziennikpolski24.pl/rss/feed/1281') + , (u'Magnes', u'http://www.dziennikpolski24.pl/rss/feed/1283') + , (u'Magazyn Piątek', u'http://www.dziennikpolski24.pl/rss/feed/1293') + , (u'Pejzaż rodzinny', u'http://www.dziennikpolski24.pl/rss/feed/1274') + , (u'Podróże', u'http://www.dziennikpolski24.pl/rss/feed/1275') + , (u'Konsument', u'http://www.dziennikpolski24.pl/rss/feed/1288') + ] + + def append_page(self, soup, appendtag): + loop=False + tag=soup.find('div', attrs = {'class':'paging'}) + if tag: + loop=True + li_nks=tag.findAll('li') + appendtag.find('div', attrs = {'class':'paging'}).extract() + if appendtag.find('ul', attrs = {'class':'menuf'}): + appendtag.find('ul', attrs = {'class':'menuf'}).extract() + while loop: + loop=False + for li_nk in li_nks: + link_tag=li_nk.contents[0].contents[0].string + if u'następna' in link_tag: + soup2=self.index_to_soup(self.INDEX+li_nk.contents[0]['href']) + if soup2.find('div', attrs = {'id':'showContent'}): + pagetext=soup2.find('div', attrs = {'id':'showContent'}) + pos=len(appendtag.contents) + appendtag.insert(pos, pagetext) + if soup2.find('div', attrs = {'class':'rightbar'}): + pagecont=soup2.find('div', attrs = {'class':'rightbar'}) + 
tag=pagecont.find('div', attrs = {'class':'paging'}) + li_nks=tag.findAll('li') + loop=True + + def get_browser(self): + br=BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open('http://www.dziennikpolski24.pl/pl/moje-konto/950606-loguj.html') + br.select_form(nr = 1) + br["user_login[login]"]=self.username + br['user_login[pass]']=self.password + br.submit() + return br + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + diff --git a/recipes/edmonton_journal.recipe b/recipes/edmonton_journal.recipe index e0c02b7d83..d3fdbc84f3 100644 --- a/recipes/edmonton_journal.recipe +++ b/recipes/edmonton_journal.recipe @@ -1,105 +1,136 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special 
Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the following six lines for the Vancouver Sun ## title = u'Vancouver Sun' ## url_prefix = 'http://www.vancouversun.com' ## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' ## fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal - title = u'Edmonton Journal' - url_prefix = 'http://www.edmontonjournal.com' - description = u'News from Edmonton, AB' - fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # 
un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six lines for the Edmonton Journal + title = u'Edmonton Journal' + url_prefix = 'http://www.edmontonjournal.com' + description = u'News from Edmonton, AB' + std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' + logo_url = 'ejlogo.jpg' + fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen ## title = u'Ottawa Citizen' ## url_prefix = 'http://www.ottawacitizen.com' ## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' ## fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + 
Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): from datetime import timedelta, date - if self.fp_tag=='': - return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +151,18 
@@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +209,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : 
["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not 
articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/recipes/ekundelek_pl.recipe b/recipes/ekundelek_pl.recipe new file mode 100644 index 0000000000..ebc5d39bbd --- /dev/null +++ b/recipes/ekundelek_pl.recipe @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = u'2012, Artur Stachecki ' + +from calibre.web.feeds.news import BasicNewsRecipe + +class swiatczytnikow(BasicNewsRecipe): + title = u'eKundelek' + description = u'Najsympatyczniejszy blog o e-czytnikach Kindle' + language = 'pl' + __author__ = u'Artur Stachecki' + oldest_article = 7 + max_articles_per_feed = 100 + + remove_tags = [dict(name = 'div', attrs = {'class' : 'feedflare'})] + + feeds = [(u'Wpisy', u'http://feeds.feedburner.com/Ekundelekpl?format=xml')] diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe index 6ee5ae3fb6..84455ddd3c 100644 --- a/recipes/fhm_uk.recipe +++ b/recipes/fhm_uk.recipe @@ -18,15 +18,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): keep_only_tags = [ dict(name='h1'), dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}), - dict(name='div',attrs={'id' : 
['articleLeft']}), - dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}), + dict(name='div',attrs={'id' : ['profileLeft','articleLeft','profileRight','profileBody']}), + dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody',]}), ] - #remove_tags = [ - #dict(attrs={'class' : ['player']}), + remove_tags = [ + dict(attrs={'id' : ['ctl00_Body_divSlideShow' ]}), - #] + ] feeds = [ (u'Homepage 1',u'http://feed43.com/6655867614547036.xml'), (u'Homepage 2',u'http://feed43.com/4167731873103110.xml'), @@ -34,7 +34,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): (u'Homepage 4',u'http://feed43.com/6550421522527341.xml'), (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'), (u'Gaming',u'http://feed43.com/6537162612465672.xml'), - (u'Girls',u'http://feed43.com/3674777224513254.xml'), + (u'Girls',u'http://feed43.com/4574262733341068.xml'),# edit link http://feed43.com/feed.html?name=4574262733341068 ] extra_css = ''' diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 2a6e00d501..ba34c9ff63 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -1,6 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe - -class Filmweb_pl(BasicNewsRecipe): +import re +from calibre.ebooks.BeautifulSoup import BeautifulSoup +class FilmWebPl(BasicNewsRecipe): title = u'FilmWeb' __author__ = 'fenuks' description = 'FilmWeb - biggest polish movie site' @@ -12,8 +13,9 @@ class Filmweb_pl(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets= True remove_empty_feeds=True + preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' - remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})] + 
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'), (u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), @@ -31,18 +33,22 @@ class Filmweb_pl(BasicNewsRecipe): (u'News / Kino polskie', u'http://www.filmweb.pl/feed/news/category/polish.cinema'), (u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), (u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), - (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest')] + (u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest') + ] - def skip_ad_pages(self, soup): + def skip_ad_pages(self, soup): skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'}) if skip_tag is not None: - self.log.warn('skip_tag') - self.log.warn(skip_tag) return self.index_to_soup(skip_tag['href'], raw=True) - + def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + for i in soup.findAll('a', attrs={'class':'fn'}): + i.insert(len(i), BeautifulSoup('
')) + for i in soup.findAll('sup'): + if not i.string or i.string.startswith('(kliknij'): + i.extract() + return soup diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index e188e4988c..fce9674081 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -1,6 +1,6 @@ from calibre.web.feeds.recipes import BasicNewsRecipe -class Gry_online_pl(BasicNewsRecipe): +class GryOnlinePl(BasicNewsRecipe): title = u'Gry-Online.pl' __author__ = 'fenuks' description = 'Gry-Online.pl - computer games' @@ -21,17 +21,18 @@ class Gry_online_pl(BasicNewsRecipe): tag = appendtag.find('div', attrs={'class':'n5p'}) if tag: nexturls=tag.findAll('a') - for nexturl in nexturls[1:]: - try: - soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href']) - except: - soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href']) + url_part = soup.find('link', attrs={'rel':'canonical'})['href'] + url_part = url_part[25:].rpartition('?')[0] + for nexturl in nexturls[1:-1]: + soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href']) pagetext = soup2.find(attrs={'class':'gc660'}) for r in pagetext.findAll(name='header'): r.extract() + for r in pagetext.findAll(attrs={'itemprop':'description'}): + r.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}): + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}): r.extract() diff --git a/recipes/high_country_blogs.recipe b/recipes/high_country_blogs.recipe new file mode 100644 index 0000000000..5173c30596 --- /dev/null +++ b/recipes/high_country_blogs.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal , Armin Geller' + +''' +Fetch High Country News - Blogs +''' +from calibre.web.feeds.news import BasicNewsRecipe +class 
HighCountryNewsBlogs(BasicNewsRecipe): + + title = u'High Country News - Blogs' + description = u'High Country News - Blogs (RSS Version)' + __author__ = 'Armin Geller' # 2012-08-01 + publisher = 'High Country News' + category = 'news, politics, Germany' + timefmt = ' [%a, %d %b %Y]' + language = 'en' + encoding = 'UTF-8' + publication_type = 'newspaper' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + auto_cleanup = True + remove_javascript = True + use_embedded_content = False + masthead_url = 'http://www.hcn.org/logo.jpg' + cover_source = 'http://www.hcn.org' + + def get_cover_url(self): + cover_source_soup = self.index_to_soup(self.cover_source) + preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'}) + return preview_image_div.div.img['src'] + + feeds = [ + (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'), + + (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'), + (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'), + (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'), + ] + + def print_version(self, url): + return url + diff --git a/recipes/icons/conowego_pl.png b/recipes/icons/conowego_pl.png new file mode 100644 index 0000000000..3bc8f2c672 Binary files /dev/null and b/recipes/icons/conowego_pl.png differ diff --git a/recipes/icons/dziennik_polski.png b/recipes/icons/dziennik_polski.png new file mode 100644 index 0000000000..d06507eca7 Binary files /dev/null and b/recipes/icons/dziennik_polski.png differ diff --git a/recipes/icons/linux_journal.png b/recipes/icons/linux_journal.png new file mode 100644 index 0000000000..ed0092bd1d Binary files /dev/null and b/recipes/icons/linux_journal.png differ diff --git a/recipes/linux_journal.recipe b/recipes/linux_journal.recipe new file mode 100755 index 0000000000..99b1a570dc --- /dev/null +++ b/recipes/linux_journal.recipe @@ 
-0,0 +1,36 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class LinuxJournal(BasicNewsRecipe): + title = u'Linux Journal' + __author__ = 'fenuks' + description = u'The monthly magazine of the Linux community, promoting the use of Linux worldwide.' + cover_url = 'http://www.linuxjournal.com/files/linuxjournal.com/ufiles/logo-lj.jpg' + category = 'IT, Linux' + language = 'en' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + keep_only_tags=[dict(id='content-inner')] + remove_tags_after= dict(attrs={'class':'user-signature clear-block'}) + remove_tags=[dict(attrs={'class':['user-signature clear-block', 'breadcrumb', 'terms terms-inline']})] + feeds = [(u'Front Page', u'http://feeds.feedburner.com/linuxjournalcom'), (u'News', u'http://feeds.feedburner.com/LinuxJournal-BreakingNews'), (u'Blogs', u'http://www.linuxjournal.com/blog/feed'), (u'Audio/Video', u'http://www.linuxjournal.com/taxonomy/term/28/0/feed'), (u'Community', u'http://www.linuxjournal.com/taxonomy/term/18/0/feed'), (u'Education', u'http://www.linuxjournal.com/taxonomy/term/25/0/feed'), (u'Embedded', u'http://www.linuxjournal.com/taxonomy/term/27/0/feed'), (u'Hardware', u'http://www.linuxjournal.com/taxonomy/term/23/0/feed'), (u'HOWTOs', u'http://www.linuxjournal.com/taxonomy/term/19/0/feed'), (u'International', u'http://www.linuxjournal.com/taxonomy/term/30/0/feed'), (u'Security', u'http://www.linuxjournal.com/taxonomy/term/31/0/feed'), (u'Software', u'http://www.linuxjournal.com/taxonomy/term/17/0/feed'), (u'Sysadmin', u'http://www.linuxjournal.com/taxonomy/term/21/0/feed'), (u'Webmaster', u'http://www.linuxjournal.com/taxonomy/term/24/0/feed')] + + def append_page(self, soup, appendtag): + next = appendtag.find('li', attrs={'class':'pager-next'}) + while next: + nexturl = next.a['href'] + appendtag.find('div', attrs={'class':'links'}).extract() + soup2 = self.index_to_soup('http://www.linuxjournal.com'+ 
nexturl) + pagetext = soup2.find(attrs={'class':'node-inner'}).find(attrs={'class':'content'}) + next = appendtag.find('li', attrs={'class':'pager-next'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = appendtag.find('div', attrs={'class':'links'}) + if tag: + tag.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/list_apart.recipe b/recipes/list_apart.recipe new file mode 100644 index 0000000000..35cbaad958 --- /dev/null +++ b/recipes/list_apart.recipe @@ -0,0 +1,33 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from calibre.web.feeds.news import BasicNewsRecipe + +class AListApart (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + __version__ = '1.0' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'A List Apart' + description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices.' 
+ language = 'en' + tags = 'web development, software' + oldest_article = 120 + remove_empty_feeds = True + no_stylesheets = True + encoding = 'utf8' + cover_url = u'http://alistapart.com/pix/alalogo.gif' + keep_only_tags = [ + dict(name='div', attrs={'id': 'content'}) + ] + remove_tags = [ + dict(name='ul', attrs={'id': 'metastuff'}), + dict(name='div', attrs={'class': 'discuss'}), + dict(name='div', attrs={'class': 'discuss'}), + dict(name='div', attrs={'id': 'learnmore'}), + ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] + extra_css = u'img {max-width: 100%; display: block; margin: auto;} #authorbio img {float: left; margin-right: 2%;}' + + feeds = [ + (u'A List Apart', u'http://www.alistapart.com/site/rss'), + ] diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe index fa5d5c19c8..5b7b3a64ed 100644 --- a/recipes/metro_uk.recipe +++ b/recipes/metro_uk.recipe @@ -1,31 +1,42 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1306097511(BasicNewsRecipe): title = u'Metro UK' - description = 'News as provide by The Metro -UK' + description = 'Author Dave Asbury : News as provide by The Metro -UK' #timefmt = '' __author__ = 'Dave Asbury' - #last update 9/6/12 + #last update 4/8/12 cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg' - #no_stylesheets = True + no_stylesheets = True oldest_article = 1 - max_articles_per_feed = 10 + max_articles_per_feed = 12 remove_empty_feeds = True remove_javascript = True - auto_cleanup = True + #auto_cleanup = True encoding = 'UTF-8' - + cover_url ='http://profile.ak.fbcdn.net/hprofile-ak-snc4/157897_117118184990145_840702264_n.jpg' language = 'en_GB' masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif' + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:1.6em;} + h2{font-family:Arial,Helvetica,sans-serif; 
font-weight:normal;font-size:1.2em;} + p{font-family:Arial,Helvetica,sans-serif;font-size:1.0em;} + body{font-family:Helvetica,Arial,sans-serif;font-size:1.0em;} + ''' keep_only_tags = [ - - ] + #dict(name='h1'), + #dict(name='h2'), + #dict(name='div', attrs={'class' : ['row','article','img-cnt figure','clrd']}) + #dict(name='h3'), + #dict(attrs={'class' : 'BText'}), + ] remove_tags = [ - + dict(name='span',attrs={'class' : 'share'}), + dict(name='li'), + dict(attrs={'class' : ['twitter-share-button','header-forms','hdr-lnks','close','art-rgt','fd-gr1-b clrd google-article','news m12 clrd clr-b p5t shareBtm','item-ds csl-3-img news','c-1of3 c-last','c-1of1','pd','item-ds csl-3-img sport']}), + dict(attrs={'id' : ['','sky-left','sky-right','ftr-nav','and-ftr','notificationList','logo','miniLogo','comments-news','metro_extras']}) ] - + remove_tags_before = dict(name='h1') + #remove_tags_after = dict(attrs={'id':['topic-buttons']}) feeds = [ (u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')] - extra_css = ''' - body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;} - ''' diff --git a/recipes/montreal_gazette.recipe b/recipes/montreal_gazette.recipe index 4ebbdbc0a1..49a5089b5c 100644 --- a/recipes/montreal_gazette.recipe +++ 
b/recipes/montreal_gazette.recipe @@ -1,48 +1,314 @@ #!/usr/bin/env python - +# -*- coding: utf-8 -*- __license__ = 'GPL v3' ''' www.canada.com ''' - +import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following three lines for the Montreal Gazette + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] + + + # un-comment the following six 
lines for the Vancouver Province +## title = u'Vancouver Province' +## url_prefix = 'http://www.theprovince.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' + + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' + + # un-comment the following six lines for the Calgary Herald +## title = u'Calgary Herald' +## url_prefix = 'http://www.calgaryherald.com' +## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' +## fp_tag = 'CAN_CH' + + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' + + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' + + # un-comment the following six lines for the Montreal Gazette title = u'Montreal Gazette' + url_prefix = 'http://www.montrealgazette.com' description = u'News from Montreal, QC' + std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' + logo_url = 'mglogo.jpg' + fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - 
auto_cleanup = True - auto_cleanup_keep = '//*[@id="imageBox"]' - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] - feeds = [ -('News', - 'http://rss.canada.com/get/?F297'), - ('Sports', - 'http://rss.canada.com/get/?F299'), - ('Entertainment', - 'http://rss.canada.com/get/?F7366'), - ('Business', - 'http://rss.canada.com/get/?F6939'), -] + def get_cover_url(self): + from datetime import timedelta, date + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' + br = 
BasicNewsRecipe.get_browser() + daysback=1 + try: + br.open(cover) + except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: + self.log("\nCover unavailable") + cover = None + return cover + + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + fixed = re.sub("’","’",fixed) + return fixed + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + + def populate_article_metadata(self, article, soup, first): + if first: + picdiv = soup.find('body').find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) + xtitle = article.text_summary.strip() + if len(xtitle) == 0: + desc = soup.find('meta',attrs={'property':'og:description'}) + if desc is not None: + article.summary = 
article.text_summary = desc['content'] + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup - + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() + return self.strip_anchors(soup) + + + + def parse_index(self): + + articles = {} + ans = [] + + + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if 
dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans + diff --git a/recipes/natemat_pl.recipe b/recipes/natemat_pl.recipe index faa1b341a0..d6db93dad7 100644 --- a/recipes/natemat_pl.recipe +++ b/recipes/natemat_pl.recipe @@ -1,3 +1,4 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe class NaTemat(BasicNewsRecipe): @@ -8,8 +9,9 @@ class NaTemat(BasicNewsRecipe): description = u'informacje, komentarze, opinie' category = 'news' language = 'pl' + preprocess_regexps = [(re.compile(ur'Czytaj też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Zobacz też\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj więcej\:.*?', re.IGNORECASE), lambda m: ''), (re.compile(ur'Czytaj również\:.*?', re.IGNORECASE), lambda m: '')] cover_url= 'http://blog.plona.pl/wp-content/uploads/2012/05/natemat.png' no_stylesheets = True keep_only_tags= [dict(id='main')] - 
remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related']})] + remove_tags= [dict(attrs={'class':['button', 'block-inside style_default', 'article-related', 'user-header', 'links']}), dict(name='img', attrs={'class':'indent'})] feeds = [(u'Artyku\u0142y', u'http://natemat.pl/rss/wszystkie')] diff --git a/recipes/ottawa_citizen.recipe b/recipes/ottawa_citizen.recipe index 32d5567d6d..0245b65231 100644 --- a/recipes/ottawa_citizen.recipe +++ b/recipes/ottawa_citizen.recipe @@ -1,105 +1,136 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + 
(u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the following six lines for the Vancouver Sun ## title = u'Vancouver Sun' ## url_prefix = 'http://www.vancouversun.com' ## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' ## fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 
'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen title = u'Ottawa Citizen' url_prefix = 'http://www.ottawacitizen.com' description = u'News from Ottawa, ON' + std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' + logo_url = 'oclogo.jpg' fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 
'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): from datetime import timedelta, date - if self.fp_tag=='': - return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +151,18 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre.utils.magick import Image, 
create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +209,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - 
continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = 
self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/recipes/phillosophy_now.recipe b/recipes/phillosophy_now.recipe new file mode 100644 index 0000000000..7c12832c70 --- /dev/null +++ b/recipes/phillosophy_now.recipe @@ -0,0 +1,75 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +from collections import OrderedDict + +class PhilosophyNow(BasicNewsRecipe): + + title = 'Philosophy Now' + __author__ = 'Rick Shang' + + description = '''Philosophy Now is a lively magazine for everyone + interested in ideas. It isn't afraid to tackle all the major questions of + life, the universe and everything. 
Published every two months, it tries to + corrupt innocent citizens by convincing them that philosophy can be + exciting, worthwhile and comprehensible, and also to provide some enjoyable + reading matter for those already ensnared by the muse, such as philosophy + students and academics.''' + language = 'en' + category = 'news' + encoding = 'UTF-8' + + keep_only_tags = [dict(attrs={'id':'fullMainColumn'})] + remove_tags = [dict(attrs={'class':'articleTools'})] + no_javascript = True + no_stylesheets = True + needs_subscription = True + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('https://philosophynow.org/auth/login') + br.select_form(nr = 1) + br['username'] = self.username + br['password'] = self.password + br.submit() + return br + + def parse_index(self): + #Go to the issue + soup0 = self.index_to_soup('http://philosophynow.org/') + issue = soup0.find('div',attrs={'id':'navColumn'}) + + #Find date & cover + cover = issue.find('div', attrs={'id':'cover'}) + date = self.tag_to_string(cover.find('h3')).strip() + self.timefmt = u' [%s]'%date + img=cover.find('img',src=True)['src'] + self.cover_url = 'http://philosophynow.org' + re.sub('medium','large',img) + issuenum = re.sub('/media/images/covers/medium/issue','',img) + issuenum = re.sub('.jpg','',issuenum) + + #Go to the main body + current_issue_url = 'http://philosophynow.org/issues/' + issuenum + soup = self.index_to_soup(current_issue_url) + div = soup.find ('div', attrs={'class':'articlesColumn'}) + + feeds = OrderedDict() + + for post in div.findAll('h3'): + articles = [] + a=post.find('a',href=True) + if a is not None: + url="http://philosophynow.org" + a['href'] + title=self.tag_to_string(a).strip() + s=post.findPrevious('h4') + section_title = self.tag_to_string(s).strip() + d=post.findNext('p') + desc = self.tag_to_string(d).strip() + articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) + + if articles: + if section_title not in feeds: + 
feeds[section_title] = [] + feeds[section_title] += articles + ans = [(key, val) for key, val in feeds.iteritems()] + return ans + diff --git a/recipes/psych.recipe b/recipes/psych.recipe index 3fc940b4a2..a21acefe30 100644 --- a/recipes/psych.recipe +++ b/recipes/psych.recipe @@ -1,44 +1,79 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ptempfile import PersistentTemporaryFile -from calibre.web.feeds.news import BasicNewsRecipe -class AdvancedUserRecipe1275708473(BasicNewsRecipe): - title = u'Psychology Today' - _author__ = 'rty' - publisher = u'www.psychologytoday.com' - category = u'Psychology' - max_articles_per_feed = 100 - remove_javascript = True - use_embedded_content = False - no_stylesheets = True +class PsychologyToday(BasicNewsRecipe): + + title = 'Psychology Today' + __author__ = 'Rick Shang' + + description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.' 
language = 'en' - temp_files = [] - articles_are_obfuscated = True - remove_tags = [ - dict(name='div', attrs={'class':['print-source_url','field-items','print-footer']}), - dict(name='span', attrs={'class':'print-footnote'}), - ] - remove_tags_before = dict(name='h1', attrs={'class':'print-title'}) - remove_tags_after = dict(name='div', attrs={'class':['field-items','print-footer']}) + category = 'news' + encoding = 'UTF-8' + keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})] + no_javascript = True + no_stylesheets = True - feeds = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')] - def get_article_url(self, article): - return article.get('link', None) + def parse_index(self): + articles = [] + soup = self.index_to_soup('http://www.psychologytoday.com/magazine') + + + #Go to the main body + div = soup.find('div',attrs={'id':'content-content'}) + #Find cover & date + cover_item = div.find('div', attrs={'class':'collections-header-image'}) + cover = cover_item.find('img',src=True) + self.cover_url = cover['src'] + date = self.tag_to_string(cover['title']) + self.timefmt = u' [%s]'%date + + articles = [] + for post in div.findAll('div', attrs={'class':'collections-node-feature-info'}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) + title = title + u' (%s)'%author + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip() + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + 
articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + for post in div.findAll('div', attrs={'class':'collections-node-thumbnail-info'}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + description = post.find('div', attrs={'class':'collection-node-description'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(description.nextSibling).strip()) + desc = self.tag_to_string(description).strip() + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + title = title + u' (%s)'%author + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + for post in div.findAll('li', attrs={'class':['collection-item-list-odd','collection-item-list-even']}): + title = self.tag_to_string(post.find('h2')) + author_item=post.find('div', attrs={'class':'collection-node-byline'}) + author = re.sub(r'.*by\s',"",self.tag_to_string(author_item).strip()) + title = title + u' (%s)'%author + article_page= self.index_to_soup('http://www.psychologytoday.com'+post.find('a', href=True)['href']) + print_page=article_page.find('li', attrs={'class':'print_html first'}) + url='http://www.psychologytoday.com'+print_page.find('a',href=True)['href'] + desc = self.tag_to_string(post.find('div', attrs={'class':'collection-node-description'})).strip() + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'','description':desc}) + + return [('Current Issue', articles)] - def get_obfuscated_article(self, url): - br = self.get_browser() - br.open(url) - response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0) - html = response.read() - 
self.temp_files.append(PersistentTemporaryFile('_fa.html')) - self.temp_files[-1].write(html) - self.temp_files[-1].close() - return self.temp_files[-1].name - def get_cover_url(self): - index = 'http://www.psychologytoday.com/magazine/' - soup = self.index_to_soup(index) - for image in soup.findAll('img',{ "class" : "imagefield imagefield-field_magazine_cover" }): - return image['src'] + '.jpg' - return None diff --git a/recipes/sfbg.recipe b/recipes/sfbg.recipe index 0735e760c6..5c77c96f74 100644 --- a/recipes/sfbg.recipe +++ b/recipes/sfbg.recipe @@ -1,25 +1,35 @@ from calibre.web.feeds.news import BasicNewsRecipe class SanFranciscoBayGuardian(BasicNewsRecipe): - title = u'San Francisco Bay Guardian' - language = 'en' - __author__ = 'Krittika Goyal' + title = u'San Francisco Bay Guardian' + language = 'en' + __author__ = 'Krittika Goyal' oldest_article = 31 #days max_articles_per_feed = 25 + #encoding = 'latin1' no_stylesheets = True + #remove_tags_before = dict(name='div', attrs={'id':'story_header'}) + #remove_tags_after = dict(name='div', attrs={'id':'shirttail'}) remove_tags = [ - dict(name='iframe'), + dict(name='iframe'), + #dict(name='div', attrs={'class':'related-articles'}), + #dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}), + #dict(name='ul', attrs={'class':'article-tools'}), + #dict(name='ul', attrs={'id':'story_tabs'}), ] feeds = [ ('sfbg', 'http://www.sfbg.com/rss.xml'), - ('politics', 'http://www.sfbg.com/politics/rss.xml'), - ('blogs', 'http://www.sfbg.com/blog/rss.xml'), - ('pixel_vision', 'http://www.sfbg.com/pixel_vision/rss.xml'), - ('bruce', 'http://www.sfbg.com/bruce/rss.xml'), ] - + #def preprocess_html(self, soup): + #story = soup.find(name='div', attrs={'id':'story_body'}) + #td = heading.findParent(name='td') + #td.extract() + #soup = BeautifulSoup('t') + #body = soup.find(name='body') + #body.insert(0, story) + #return soup diff --git a/recipes/slashdot.recipe b/recipes/slashdot.recipe index 
b10700a749..577582ee70 100644 --- a/recipes/slashdot.recipe +++ b/recipes/slashdot.recipe @@ -19,23 +19,12 @@ class Slashdot(BasicNewsRecipe): __author__ = 'floweros edited by Huan T' no_stylesheets = True + use_embedded_content = False keep_only_tags = [ - dict(name='div',attrs={'id':'article'}), - dict(name='div',attrs={'class':['postBody' 'details']}), - dict(name='footer',attrs={'class':['clearfix meta article-foot']}), - dict(name='article',attrs={'class':['fhitem fhitem-story article usermode thumbs grid_24']}), - dict(name='dl',attrs={'class':'relatedPosts'}), - dict(name='h2',attrs={'class':'story'}), - dict(name='span',attrs={'class':'comments'}), - ] - - - remove_tags = [ - dict(name='aside',attrs={'id':'slashboxes'}), - dict(name='div',attrs={'class':'paginate'}), - dict(name='section',attrs={'id':'comments'}), - dict(name='span',attrs={'class':'topic'}), - ] + dict(name='div',attrs={'class':'story'}), + dict(name='div',attrs={'class':'body'}), + dict(name='ul',attrs={'id':'commentlisting'}), + ] feeds = [ (u'Slashdot', diff --git a/recipes/smashing.recipe b/recipes/smashing.recipe index 04436a05ef..bc24166275 100644 --- a/recipes/smashing.recipe +++ b/recipes/smashing.recipe @@ -1,50 +1,24 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' -''' -www.smashingmagazine.com -''' - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from calibre.web.feeds.news import BasicNewsRecipe -class SmashingMagazine(BasicNewsRecipe): - title = 'Smashing Magazine' - __author__ = 'Darko Miletic' - description = 'We smash you with the information that will make your life easier, really' - oldest_article = 20 - language = 'en' - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - publisher = 'Smashing Magazine' - category = 'news, web, IT, css, javascript, html' - encoding = 'utf-8' +class SmashingMagazine (BasicNewsRecipe): + __author__ = u'Marc Busqué ' + __url__ = 'http://www.lamarciana.com' + 
__version__ = '1.0.1' + __license__ = 'GPL v3' + __copyright__ = u'2012, Marc Busqué ' + title = u'Smashing Magazine' + description = u'Founded in September 2006, Smashing Magazine delivers useful and innovative information to Web designers and developers. Our aim is to inform our readers about the latest trends and techniques in Web development. We try to persuade you not with the quantity but with the quality of the information we present. Smashing Magazine is and always has been independent.' + language = 'en' + tags = 'web development, software' + oldest_article = 7 + remove_empty_feeds = True + no_stylesheets = True + encoding = 'utf8' + cover_url = u'http://media.smashingmagazine.com/themes/smashingv4/images/logo.png' + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', 'valign', 'vspace', 'hspace', 'alt', 'width', 'height', 'style'] + extra_css = u'body div table:first-child {display: none;} img {max-width: 100%; display: block; margin: auto;}' - conversion_options = { - 'comments' : description - ,'tags' : category - ,'publisher' : publisher - } - - keep_only_tags = [dict(name='div', attrs={'id':'leftcolumn'})] - remove_tags_after = dict(name='ul',attrs={'class':'social'}) - remove_tags = [ - dict(name=['link','object']) - ,dict(name='h1',attrs={'class':'logo'}) - ,dict(name='div',attrs={'id':'booklogosec'}) - ,dict(attrs={'src':'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'}) - ] - - feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')] - - def preprocess_html(self, soup): - for iter in soup.findAll('div',attrs={'class':'leftframe'}): - it = iter.find('h1') - if it == None: - iter.extract() - for item in soup.findAll('img'): - oldParent = item.parent - if oldParent.name == 'a': - oldParent.name = 'div' - return soup + feeds = [ + (u'Smashing Magazine', u'http://rss1.smashingmagazine.com/feed/'), + ] diff --git a/recipes/smith.recipe b/recipes/smith.recipe index 
8bf60a227a..3d6a95c494 100644 --- a/recipes/smith.recipe +++ b/recipes/smith.recipe @@ -1,61 +1,67 @@ import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from collections import OrderedDict -class SmithsonianMagazine(BasicNewsRecipe): - title = u'Smithsonian Magazine' - language = 'en' - __author__ = 'Krittika Goyal and TerminalVeracity' - oldest_article = 31#days - max_articles_per_feed = 50 - use_embedded_content = False - recursions = 1 - cover_url = 'http://sphotos.xx.fbcdn.net/hphotos-snc7/431147_10150602715983253_764313347_n.jpg' - match_regexps = ['&page=[2-9]$'] - preprocess_regexps = [ - (re.compile(r'for more of Smithsonian\'s coverage on history, science and nature.', re.DOTALL), lambda m: '') - ] - extra_css = """ - h1{font-size: large; margin: .2em 0} - h2{font-size: medium; margin: .2em 0} - h3{font-size: medium; margin: .2em 0} - #byLine{margin: .2em 0} - .articleImageCaptionwide{font-style: italic} - .wp-caption-text{font-style: italic} - img{display: block} - """ +class Smithsonian(BasicNewsRecipe): + title = 'Smithsonian Magazine' + __author__ = 'Rick Shang' - remove_stylesheets = True - remove_tags_after = dict(name='div', attrs={'class':['post','articlePaginationWrapper']}) - remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':['article_sidebar_border','viewMorePhotos','addtoany_share_save_container','meta','social','OUTBRAIN','related-articles-inpage']}), - dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large','comment_section','article-related']}), - dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}), - dict(name='h4', attrs={'id':'related-topics'}), - dict(name='table'), - dict(name='a', attrs={'href':['/subArticleBottomWeb','/subArticleTopWeb','/subArticleTopMag','/subArticleBottomMag']}), - dict(name='a', 
attrs={'name':'comments_shaded'}), - ] + description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.' + language = 'en' + category = 'news' + encoding = 'UTF-8' + keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})] + remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})] + no_javascript = True + no_stylesheets = True + def parse_index(self): + #Go to the issue + soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/') + div = soup0.find('div',attrs={'id':'archives'}) + issue = div.find('ul',attrs={'class':'clear-both'}) + current_issue_url = issue.find('a', href=True)['href'] + soup = self.index_to_soup(current_issue_url) - feeds = [ -('History and Archeology', - 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'), -('People and Places', - 'http://feeds.feedburner.com/smithsonianmag/people-places'), -('Science and Nature', - 'http://feeds.feedburner.com/smithsonianmag/science-nature'), -('Arts and Culture', - 'http://feeds.feedburner.com/smithsonianmag/arts-culture'), -('Travel', - 'http://feeds.feedburner.com/smithsonianmag/travel'), -] + #Go to the main body + div = soup.find ('div', attrs={'id':'content-inset'}) + + #Find date + date = re.sub('.*\:\W*', "", self.tag_to_string(div.find('h2')).strip()) + self.timefmt = u' [%s]'%date + + #Find cover + self.cover_url = div.find('img',src=True)['src'] + + feeds = OrderedDict() + section_title = '' + subsection_title = '' + for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}): + articles = [] + prefix = '' + 
h3=post.find('h3') + if h3 is not None: + section_title = self.tag_to_string(h3) + else: + subsection=post.find('p',attrs={'class':'article-cat'}) + link=post.find('a',href=True) + url=link['href']+'?c=y&story=fullstory' + if subsection is not None: + subsection_title = self.tag_to_string(subsection) + prefix = (subsection_title+': ') + description=self.tag_to_string(post('p', limit=2)[1]).strip() + else: + description=self.tag_to_string(post.find('p')).strip() + desc=re.sub('\sBy\s.*', '', description, re.DOTALL) + author=re.sub('.*By\s', '', description, re.DOTALL) + title=prefix + self.tag_to_string(link).strip()+ u' (%s)'%author + articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) + + if articles: + if section_title not in feeds: + feeds[section_title] = [] + feeds[section_title] += articles + ans = [(key, val) for key, val in feeds.iteritems()] + return ans - def preprocess_html(self, soup): - story = soup.find(name='div', attrs={'id':'article-body'}) - soup = BeautifulSoup('t') - body = soup.find(name='body') - body.insert(0, story) - return soup diff --git a/recipes/sueddeutsche_mobil.recipe b/recipes/sueddeutsche_mobil.recipe new file mode 100644 index 0000000000..d1b08cbcba --- /dev/null +++ b/recipes/sueddeutsche_mobil.recipe @@ -0,0 +1,117 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +__license__ = 'GPL v3' +__copyright__ = '2012, Andreas Zeiser ' +''' +szmobil.sueddeutsche.de/ +''' + +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class SZmobil(BasicNewsRecipe): + title = u'Süddeutsche Zeitung mobil' + __author__ = u'Andreas Zeiser' + description = u'Nachrichten aus Deutschland. Zugriff auf kostenpflichtiges Abo SZ mobil.' 
+ publisher = u'Sueddeutsche Zeitung' + language = u'de' + publication_type = u'newspaper' + category = u'news, politics, Germany' + + no_stylesheets = True + oldest_article = 2 + encoding = 'iso-8859-1' + needs_subscription = True + remove_empty_feeds = True + delay = 1 + cover_source = 'http://www.sueddeutsche.de/verlag' + + timefmt = ' [%a, %d %b, %Y]' + + root_url ='http://szmobil.sueddeutsche.de/' + keep_only_tags = [dict(name='div', attrs={'class':'article'})] + + def get_cover_url(self): + src = self.index_to_soup(self.cover_source) + image_url = src.find(attrs={'class':'preview-image'}) + return image_url.div.img['src'] + + def get_browser(self): + browser = BasicNewsRecipe.get_browser(self) + + # Login via fetching of Streiflicht -> Fill out login request + url = self.root_url + 'show.php?id=streif' + browser.open(url) + + browser.select_form(nr=0) # to select the first form + browser['username'] = self.username + browser['password'] = self.password + browser.submit() + + return browser + + def parse_index(self): + # find all sections + src = self.index_to_soup('http://szmobil.sueddeutsche.de') + feeds = [] + for itt in src.findAll('a',href=True): + if itt['href'].startswith('show.php?section'): + feeds.append( (itt.string[0:-2],itt['href']) ) + + all_articles = [] + for feed in feeds: + feed_url = self.root_url + feed[1] + feed_title = feed[0] + + self.report_progress(0, ('Fetching feed')+' %s...'%(feed_title if feed_title else feed_url)) + + src = self.index_to_soup(feed_url) + articles = [] + shorttitles = dict() + for itt in src.findAll('a', href=True): + if itt['href'].startswith('show.php?id='): + article_url = itt['href'] + article_id = int(re.search("id=(\d*)&etag=", itt['href']).group(1)) + + # first check if link is a special article in section "Meinungsseite" + if itt.find('strong')!= None: + article_name = itt.strong.string + article_shorttitle = itt.contents[1] + + articles.append( (article_name, article_url, article_id) ) + 
shorttitles[article_id] = article_shorttitle + continue + + + # candidate for a general article + if itt.string == None: + article_name = '' + else: + article_name = itt.string + + if (article_name[0:10] == " mehr"): + # just another link ("mehr") to an article + continue + + if itt.has_key('id'): + shorttitles[article_id] = article_name + else: + articles.append( (article_name, article_url, article_id) ) + + feed_articles = [] + for article_name, article_url, article_id in articles: + url = self.root_url + article_url + title = article_name + pubdate = strftime('%a, %d %b') + description = '' + if shorttitles.has_key(article_id): + description = shorttitles[article_id] + # we do not want the flag ("Impressum") + if "HERAUSGEGEBEN VOM" in description: + continue + d = dict(title=title, url=url, date=pubdate, description=description, content='') + feed_articles.append(d) + all_articles.append( (feed_title, feed_articles) ) + + return all_articles + diff --git a/recipes/the_new_republic.recipe b/recipes/the_new_republic.recipe index 59ccef3607..057b898f42 100644 --- a/recipes/the_new_republic.recipe +++ b/recipes/the_new_republic.recipe @@ -1,45 +1,68 @@ -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +from collections import OrderedDict -class The_New_Republic(BasicNewsRecipe): - title = 'The New Republic' - __author__ = 'cix3' +class TNR(BasicNewsRecipe): + + title = 'The New Republic' + __author__ = 'Rick Shang' + + description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.' 
language = 'en' - description = 'Intelligent, stimulating and rigorous examination of American politics, foreign policy and culture' - timefmt = ' [%b %d, %Y]' - - oldest_article = 7 - max_articles_per_feed = 100 + category = 'news' + encoding = 'UTF-8' + remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})] + no_javascript = True no_stylesheets = True - remove_tags = [ - dict(name='div', attrs={'class':['print-logo', 'print-site_name', 'img-left', 'print-source_url']}), - dict(name='hr', attrs={'class':'print-hr'}), dict(name='img') - ] - feeds = [ - ('Politics', 'http://www.tnr.com/rss/articles/Politics'), - ('Books and Arts', 'http://www.tnr.com/rss/articles/Books-and-Arts'), - ('Economy', 'http://www.tnr.com/rss/articles/Economy'), - ('Environment and Energy', 'http://www.tnr.com/rss/articles/Environment-%2526-Energy'), - ('Health Care', 'http://www.tnr.com/rss/articles/Health-Care'), - ('Metro Policy', 'http://www.tnr.com/rss/articles/Metro-Policy'), - ('World', 'http://www.tnr.com/rss/articles/World'), - ('Film', 'http://www.tnr.com/rss/articles/Film'), - ('Books', 'http://www.tnr.com/rss/articles/books'), - ('The Book', 'http://www.tnr.com/rss/book'), - ('Jonathan Chait', 'http://www.tnr.com/rss/blogs/Jonathan-Chait'), - ('The Plank', 'http://www.tnr.com/rss/blogs/The-Plank'), - ('The Treatment', 'http://www.tnr.com/rss/blogs/The-Treatment'), - ('The Spine', 'http://www.tnr.com/rss/blogs/The-Spine'), - ('The Vine', 'http://www.tnr.com/rss/blogs/The-Vine'), - ('The Avenue', 'http://www.tnr.com/rss/blogs/The-Avenue'), - ('William Galston', 'http://www.tnr.com/rss/blogs/William-Galston'), - ('Simon Johnson', 'http://www.tnr.com/rss/blogs/Simon-Johnson'), - ('Ed Kilgore', 'http://www.tnr.com/rss/blogs/Ed-Kilgore'), - ('Damon Linker', 'http://www.tnr.com/rss/blogs/Damon-Linker'), - ('John McWhorter', 'http://www.tnr.com/rss/blogs/John-McWhorter') - ] + def parse_index(self): - def print_version(self, url): - return 
url.replace('http://www.tnr.com/', 'http://www.tnr.com/print/') + #Go to the issue + soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues') + issue = soup0.find('div',attrs={'id':'current_issue'}) + #Find date + date = self.tag_to_string(issue.find('div',attrs={'class':'date'})).strip() + self.timefmt = u' [%s]'%date + + #Go to the main body + current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href'] + soup = self.index_to_soup(current_issue_url) + div = soup.find ('div', attrs={'class':'article_detail_body'}) + + + + #Find cover + self.cover_url = div.find('img',src=True)['src'] + + feeds = OrderedDict() + section_title = '' + subsection_title = '' + for post in div.findAll('p'): + articles = [] + em=post.find('em') + b=post.find('b') + a=post.find('a',href=True) + p=post.find('img', src=True) + #Find cover + if p is not None: + self.cover_url = p['src'].strip() + if em is not None: + section_title = self.tag_to_string(em).strip() + subsection_title = '' + elif b is not None: + subsection_title=self.tag_to_string(b).strip() + elif a is not None: + prefix = (subsection_title+': ') if subsection_title else '' + url=re.sub('www.tnr.com','www.tnr.com/print', a['href']) + author=re.sub('.*by\s', '', self.tag_to_string(post), re.DOTALL) + title=prefix + self.tag_to_string(a).strip()+ u' (%s)'%author + articles.append({'title':title, 'url':url, 'description':'', 'date':''}) + + if articles: + if section_title not in feeds: + feeds[section_title] = [] + feeds[section_title] += articles + ans = [(key, val) for key, val in feeds.iteritems()] + return ans diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe index 11500430ff..d93ac2c49b 100644 --- a/recipes/the_sun.recipe +++ b/recipes/the_sun.recipe @@ -1,4 +1,4 @@ -import re, random +import random from calibre import browser from calibre.web.feeds.recipes import BasicNewsRecipe @@ -6,48 +6,45 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class 
AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'The Sun UK' - description = 'A Recipe for The Sun tabloid UK' + description = 'Articles from The Sun tabloid UK' __author__ = 'Dave Asbury' - # last updated 29/4/12 + # last updated 25/7/12 language = 'en_GB' oldest_article = 1 - max_articles_per_feed = 15 + max_articles_per_feed = 12 remove_empty_feeds = True no_stylesheets = True - #auto_cleanup = True - #articles_are_obfuscated = True + masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif' encoding = 'UTF-8' - - remove_empty_feeds = True remove_javascript = True no_stylesheets = True + + + #preprocess_regexps = [ + # (re.compile(r'