Merge from trunk
Changelog.yaml (271 lines changed)

@@ -5,7 +5,7 @@
 # Also, each release can have new and improved recipes.

 # - version: ?.?.?
-#   date: 2012-??-??
+#   date: 2013-??-??
 #
 #   new features:
 #   - title:
@@ -19,6 +19,275 @@
 #   new recipes:
 #   - title:

+- version: 0.9.14
+  date: 2013-01-11
+
+  new features:
+    - title: "When adding multiple books and duplicates are found, allow the user to select which of the duplicate books will be added anyway."
+      tickets: [1095256]
+
+    - title: "Device drivers for Kobo Arc on linux, Polaroid Android tablet"
+      tickets: [1098049]
+
+    - title: "When sorting by series, use the language of the book to decide what leading articles to remove, just as is done for sorting by title"
+
+  bug fixes:
+    - title: "PDF Output: Do not error out when the input document contains links with anchors not present in the document."
+      tickets: [1096428]
+
+    - title: "Add support for upgraded db on newest Kobo firmware"
+      tickets: [1095617]
+
+    - title: "PDF Output: Fix typo that broke use of custom paper sizes."
+      tickets: [1097563]
+
+    - title: "PDF Output: Handle empty anchors present at the end of a page"
+
+    - title: "PDF Output: Fix side margins of last page in a flow being incorrect when large side margins are used."
+      tickets: [1096290]
+
+    - title: "Edit metadata dialog: Allow setting the series number for custom series type columns to zero"
+
+    - title: "When bulk editing custom series-type columns and not providing a series number use 1 as the default, instead of None"
+
+    - title: "Catalogs: Fix issue with catalog generation using Hungarian UI and author_sort beginning with multiple letter groups."
+      tickets: [1091581]
+
+    - title: "PDF Output: Don't error out on files that have invalid font-family declarations."
+      tickets: [1096279]
+
+    - title: "Do not load QRawFont at global level, to allow calibre installation on systems with missing dependencies"
+      tickets: [1096170]
+
+    - title: "PDF Output: Fix cover not present in generated PDF files"
+      tickets: [1096098]
+
+  improved recipes:
+    - Sueddeutsche Zeitung mobil
+    - Boerse Online
+    - TidBits
+    - New York Review of Books
+    - Fleshbot
+    - Il Messaggero
+    - Libero
+
+  new recipes:
+    - title: Spectator Magazine, Oxford Mail and Outside Magazine
+      author: Krittika Goyal
+
+    - title: Libartes
+      author: Darko Miletic
+
+    - title: El Diplo
+      author: Tomas De Domenico
+
+- version: 0.9.13
+  date: 2013-01-04
+
+  new features:
+    - title: "Complete rewrite of the PDF Output engine, to support links and fix various bugs"
+      type: major
+      description: "calibre now has a new PDF output engine that supports links in the text. It also fixes various bugs, detailed below. In order to implement support for links and fix these bugs, the engine had to be completely rewritten, so there may be some regressions."
+
+    - title: "Show disabled device plugins in Preferences->Ignored Devices"
+
+    - title: "Get Books: Fix Smashwords, Google books and B&N stores. Add Nook UK store"
+
+    - title: "Allow series numbers lower than -100 for custom series columns."
+      tickets: [1094475]
+
+    - title: "Add mass storage driver for rockchip based android smart phones"
+      tickets: [1087809]
+
+    - title: "Add a clear ratings button to the edit metadata dialog"
+
+  bug fixes:
+    - title: "PDF Output: Fix custom page sizes not working on OS X"
+
+    - title: "PDF Output: Fix embedding of many fonts not supported (note that embedding of OpenType fonts with Postscript outlines is still not supported on windows, though it is supported on other operating systems)"
+
+    - title: "PDF Output: Fix crashes converting some books to PDF on OS X"
+      tickets: [1087688]
+
+    - title: "HTML Input: Handle entities inside href attributes when following the links in an HTML file."
+      tickets: [1094203]
+
+    - title: "Content server: Fix custom icons not used for sub categories"
+      tickets: [1095016]
+
+    - title: "Force use of non-unicode constants in compiled templates. Fixes a problem with regular expression character classes and probably other things."
+
+    - title: "Kobo driver: Do not error out if there are invalid dates in the device database"
+      tickets: [1094597]
+
+    - title: "Content server: Fix for non-unicode hostnames when using mDNS"
+      tickets: [1094063]
+
+  improved recipes:
+    - Today's Zaman
+    - The Economist
+    - Foreign Affairs
+    - New York Times
+    - Alternet
+    - Harper's Magazine
+    - La Stampa
+
+- version: 0.9.12
+  date: 2012-12-28
+
+  new features:
+    - title: "Drivers for Kibano e-reader and Slick ER-700-2"
+      tickets: [1093570, 1093732]
+
+    - title: "Add support for downloading metadata from Amazon Brazil."
+      tickets: [1092594]
+
+    - title: "Copy to library: Allow specifying the destination library by path."
+      tickets: [1093231]
+
+    - title: "When adding empty books, allow setting of the series for the new books. Also select the newly added book records after adding."
+
+    - title: "PDF Output: Add a checkbox to override the page size defined by the output profile. This allows you to specify a custom page size even if the output profile is not set to default."
+
+    - title: "Add usb ids for newer kindle fire to the linux mtp driver"
+
+  bug fixes:
+    - title: "Linux: Temporarily redirect stdout to get rid of the annoying and pointless message about mtpz during libmtp initialization"
+
+    - title: "Fix multiple 'All column' coloring rules not being applied"
+      tickets: [1093574]
+
+    - title: "Use custom icons in the content server as well."
+      tickets: [1092098]
+
+  improved recipes:
+    - La Voce
+    - Harpers Magazine (printed edition)
+    - Pajamas Media
+    - NSFW corp
+    - The Hindu
+    - Nikkei News
+
+  new recipes:
+    - title: Various Ukrainian news sources
+      author: rpalyvoda
+
+- version: 0.9.11
+  date: 2012-12-21
+
+  new features:
+    - title: "Merry Christmas and Happy Holidays to all ☺"
+
+    - title: "When connecting to MTP devices such as the Kindle Fire HD or the Nook HD, speed up the process by ignoring some folders."
+      description: "calibre will now ignore folders for music, video, pictures, etc. when scanning the device. This can substantially speed up the connection process if you have thousands of non-ebook files on the device. The list of folders to be ignored can be customized by right clicking on the device icon in calibre and selecting 'Configure this device'."
+
+    - title: "Allow changing the icons for categories in the Tag Browser. Right click on a category and choose 'Change category icon'."
+      tickets: [1092098]
+
+    - title: "Allow setting the color of all columns with a single rule in Preferences->Look & Feel->Column Coloring"
+
+    - title: "MOBI: When reading metadata from mobi files, put the contents of the ASIN field into an identifier named mobi-asin. Note that this value is not used when downloading metadata as it is not possible to know which (country specific) amazon website the ASIN comes from."
+      tickets: [1090394]
+
+  bug fixes:
+    - title: "Windows build: Fix a regression in 0.9.9 that caused calibre to not start on some windows systems that were missing the VC.90 dlls (some older XP systems)"
+
+    - title: "Kobo driver: Workaround for invalid shelves created by bugs in the Kobo server"
+      tickets: [1091932]
+
+    - title: "Metadata download: Fix cover downloading from non-US amazon sites broken by a website change."
+      tickets: [1090765]
+
+  improved recipes:
+    - Le Devoir
+    - Nin online
+    - countryfile
+    - Birmingham Post
+    - The Independent
+    - Various Polish news sources
+
+  new recipes:
+    - title: MobileBulgaria
+      author: Martin Tsanchev
+
+    - title: Various Polish news sources
+      author: fenuks
+
+- version: 0.9.10
+  date: 2012-12-14
+
+  new features:
+    - title: "Drivers for Nextbook Premium 8 se, HTC Desire X and Emerson EM 543"
+      tickets: [1088149, 1088112, 1087978]
+
+  bug fixes:
+    - title: "Fix rich text delegate not working with Qt compiled in debug mode."
+      tickets: [1089011]
+
+    - title: "When deleting all books in the library, blank the book details panel"
+
+    - title: "Conversion: Fix malformed values in the bgcolor attribute causing conversion to abort"
+
+    - title: "Conversion: Fix heuristics applying incorrect style in some circumstances"
+      tickets: [1066507]
+
+    - title: "Possible fix for 64bit calibre not starting up on some Windows systems"
+      tickets: [1087816]
+
+  improved recipes:
+    - Sivil Dusunce
+    - Anchorage Daily News
+    - Le Monde
+    - Harpers
+
+  new recipes:
+    - title: Titanic
+      author: Krittika Goyal
+
+- version: 0.9.9
+  date: 2012-12-07
+
+  new features:
+    - title: "64 bit build for windows"
+      type: major
+      description: "calibre now has a 64 bit version for windows, available at: http://calibre-ebook.com/download_windows64 The 64bit build is not limited to using only 3GB of RAM when converting large/complex documents. It may also be slightly faster for some tasks. You can have both the 32 bit and the 64 bit build installed at the same time, they will use the same libraries, plugins and settings."
+
+    - title: "Content server: Make the identifiers in each books metadata clickable."
+      tickets: [1085726]
+
+  bug fixes:
+    - title: "EPUB Input: Fix an infinite loop while trying to recover a damaged EPUB file."
+      tickets: [1086917]
+
+    - title: "KF8 Input: Fix handling of links in files that link to the obsolete <a name> tags instead of tags with an id attribute."
+      tickets: [1086705]
+
+    - title: "Conversion: Fix a bug in removal of invalid entries from the spine, where not all invalid entries were removed, causing conversion to fail."
+      tickets: [1086054]
+
+    - title: "KF8 Input: Ignore invalid flow references in the KF8 document instead of erroring out on them."
+      tickets: [1085306]
+
+    - title: "Fix command line output on linux systems with incorrect LANG/LC_TYPE env vars."
+      tickets: [1085103]
+
+    - title: "KF8 Input: Fix page breaks specified using the data-AmznPageBreak attribute being ignored by calibre."
+
+    - title: "PDF Output: Fix custom size field not accepting fractional numbers as sizes"
+
+    - title: "Get Books: Update libre.de and publio for website changes"
+
+    - title: "Wireless driver: Increase timeout interval, and when allocating a random port try 9090 first"
+
+  improved recipes:
+    - New York Times
+    - Weblogs SL
+    - Zaman Gazetesi
+    - Aksiyon Dergisi
+    - Engadget
+    - Metro UK
+    - Heise Online
+
 - version: 0.9.8
   date: 2012-11-30

README (10 lines changed)

@@ -1,7 +1,7 @@
-calibre is an e-book library manager. It can view, convert and catalog e-books \
-in most of the major e-book formats. It can also talk to e-book reader \
-devices. It can go out to the internet and fetch metadata for your books. \
-It can download newspapers and convert them into e-books for convenient \
+calibre is an e-book library manager. It can view, convert and catalog e-books
+in most of the major e-book formats. It can also talk to e-book reader
+devices. It can go out to the internet and fetch metadata for your books.
+It can download newspapers and convert them into e-books for convenient
 reading. It is cross platform, running on Linux, Windows and OS X.

 For screenshots: https://calibre-ebook.com/demo

@@ -15,5 +15,5 @@ bzr branch lp:calibre
 To update your copy of the source code:

   bzr merge

-Tarballs of the source code for each release are now available \
+Tarballs of the source code for each release are now available
 at http://code.google.com/p/calibre-ebook
@@ -49,7 +49,7 @@ All the |app| python code is in the ``calibre`` package. This package contains t
 * Metadata reading, writing, and downloading is all in ebooks.metadata
 * Conversion happens in a pipeline, for the structure of the pipeline,
   see :ref:`conversion-introduction`. The pipeline consists of an input
-  plugin, various transforms and an output plugin. The code constructs
+  plugin, various transforms and an output plugin. The code that constructs
   and drives the pipeline is in plumber.py. The pipeline works on a
   representation of an ebook that is like an unzipped epub, with
   manifest, spine, toc, guide, html content, etc. The

@@ -74,10 +74,6 @@ After installing Bazaar, you can get the |app| source code with the command::

 On Windows you will need the complete path name, that will be something like :file:`C:\\Program Files\\Bazaar\\bzr.exe`.

-To update a branch to the latest code, use the command::
-
-  bzr merge
-
 |app| is a very large project with a very long source control history, so the
 above can take a while (10mins to an hour depending on your internet speed).

@@ -88,6 +84,11 @@ using::

   bzr branch --stacked lp:calibre

+
+To update a branch to the latest code, use the command::
+
+  bzr merge
+
 Submitting your changes to be included
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -162,7 +162,8 @@ Follow these steps to find the problem:
 * If you are connecting an Apple iDevice (iPad, iPod Touch, iPhone), use the 'Connect to iTunes' method in the 'Getting started' instructions in `Calibre + Apple iDevices: Start here <http://www.mobileread.com/forums/showthread.php?t=118559>`_.
 * Make sure you are running the latest version of |app|. The latest version can always be downloaded from `the calibre website <http://calibre-ebook.com/download>`_.
 * Ensure your operating system is seeing the device. That is, the device should show up in Windows Explorer (in Windows) or Finder (in OS X).
-* In |app|, go to Preferences->Plugins->Device Interface plugin and make sure the plugin for your device is enabled, the plugin icon next to it should be green when it is enabled.
+* In |app|, go to Preferences->Ignored Devices and check that your device
+  is not being ignored
 * If all the above steps fail, go to Preferences->Miscellaneous and click debug device detection with your device attached and post the output as a ticket on `the calibre bug tracker <http://bugs.calibre-ebook.com>`_.

 My device is non-standard or unusual. What can I do to connect to it?

@@ -436,10 +437,10 @@ that allows you to create collections on your Kindle from the |app| metadata. It

 .. note:: Amazon have removed the ability to manipulate collections completely in their newer models, like the Kindle Touch and Kindle Fire, making even the above plugin useless. If you really want the ability to manage collections on your Kindle via a USB connection, we encourage you to complain to Amazon about it, or get a reader where this is supported, like the SONY or Kobo Readers.

-I am getting an error when I try to use |app| with my Kobo Touch?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+I am getting an error when I try to use |app| with my Kobo Touch/Glo/etc.?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The Kobo Touch has very buggy firmware. Connecting to it has been known to fail at random. Certain combinations of motherboard, USB ports/cables/hubs can exacerbate this tendency to fail. If you are getting an error when connecting to your touch with |app| try the following, each of which has solved the problem for *some* |app| users.
+The Kobo has very buggy firmware. Connecting to it has been known to fail at random. Certain combinations of motherboard, USB ports/cables/hubs can exacerbate this tendency to fail. If you are getting an error when connecting to your touch with |app| try the following, each of which has solved the problem for *some* |app| users.

 * Connect the Kobo directly to your computer, not via USB Hub
 * Try a different USB cable and a different USB port on your computer

@@ -668,6 +669,22 @@ There are three possible things I know of, that can cause this:
   the blacklist of programs inside RoboForm to fix this. Or uninstall
   RoboForm.

+* The Logitech SetPoint Settings application causes random crashes in
+  |app| when it is open. Close it before starting |app|.
+
+If none of the above apply to you, then there is some other program on your
+computer that is interfering with |app|. First reboot your computer in safe
+mode, to have as few running programs as possible, and see if the crashes still
+happen. If they do not, then you know it is some program causing the problem.
+The most likely such culprit is a program that modifies other programs'
+behavior, such as an antivirus, a device driver, something like RoboForm (an
+automatic form filling app) or an assistive technology like Voice Control or a
+Screen Reader.
+
+The only way to find the culprit is to eliminate the programs one by one and
+see which one is causing the issue. Basically, stop a program, run calibre,
+check for crashes. If they still happen, stop another program and repeat.
+
 |app| is not starting on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -9,11 +9,12 @@ class Adventure_zone(BasicNewsRecipe):
     no_stylesheets = True
     oldest_article = 20
     max_articles_per_feed = 100
+    cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
     index='http://www.adventure-zone.info/fusion/'
     use_embedded_content=False
     preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''),
-        (re.compile(r'\<table .*?\>'), lambda match: ''),
-        (re.compile(r'\<tbody\>'), lambda match: '')]
+        (re.compile(r'</?table.*?>'), lambda match: ''),
+        (re.compile(r'</?tbody.*?>'), lambda match: '')]
     remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
     remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
     remove_tags_after= dict(id='comments')

@@ -36,11 +37,11 @@ class Adventure_zone(BasicNewsRecipe):
         return feeds


-    def get_cover_url(self):
+    '''def get_cover_url(self):
         soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
         cover=soup.find(id='box_OstatninumerAZ')
         self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src']
-        return getattr(self, 'cover_url', self.cover_url)
+        return getattr(self, 'cover_url', self.cover_url)'''


     def skip_ad_pages(self, soup):
@@ -10,14 +10,12 @@ class Alternet(BasicNewsRecipe):
     category = 'News, Magazine'
     description = 'News magazine and online community'
     feeds = [
-        (u'Front Page', u'http://feeds.feedblitz.com/alternet'),
-        (u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
-        (u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
-        (u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
+        (u'Front Page', u'http://feeds.feedblitz.com/alternet')
     ]

     remove_attributes = ['width', 'align','cellspacing']
     remove_javascript = True
-    use_embedded_content = False
+    use_embedded_content = True
     no_stylesheets = True
     language = 'en'
     encoding = 'UTF-8'
@@ -5,6 +5,8 @@ class AdvancedUserRecipe1278347258(BasicNewsRecipe):
     __author__ = 'rty'
     oldest_article = 7
     max_articles_per_feed = 100
+    auto_cleanup = True
+
     feeds = [(u'Alaska News', u'http://www.adn.com/news/alaska/index.xml'),
         (u'Business', u'http://www.adn.com/money/index.xml'),

@@ -28,13 +30,13 @@ class AdvancedUserRecipe1278347258(BasicNewsRecipe):
     conversion_options = {'linearize_tables':True}
     masthead_url = 'http://media.adn.com/includes/assets/images/adn_logo.2.gif'

-    keep_only_tags = [
-        dict(name='div', attrs={'class':'left_col story_mainbar'}),
-        ]
-    remove_tags = [
-        dict(name='div', attrs={'class':'story_tools'}),
-        dict(name='p', attrs={'class':'ad_label'}),
-        ]
-    remove_tags_after = [
-        dict(name='div', attrs={'class':'advertisement'}),
-        ]
+    #keep_only_tags = [
+        #dict(name='div', attrs={'class':'left_col story_mainbar'}),
+        #]
+    #remove_tags = [
+        #dict(name='div', attrs={'class':'story_tools'}),
+        #dict(name='p', attrs={'class':'ad_label'}),
+        #]
+    #remove_tags_after = [
+        #dict(name='div', attrs={'class':'advertisement'}),
+        #]
@@ -3,11 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Android_com_pl(BasicNewsRecipe):
     title = u'Android.com.pl'
     __author__ = 'fenuks'
-    description = 'Android.com.pl - biggest polish Android site'
+    description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.'
     category = 'Android, mobile'
     language = 'pl'
     use_embedded_content=True
-    cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
+    cover_url =u'http://android.com.pl/wp-content/themes/android/images/logo.png'
     oldest_article = 8
     max_articles_per_feed = 100
-    feeds = [(u'Android', u'http://android.com.pl/component/content/frontpage/frontpage.feed?type=rss')]
+    feeds = [(u'Android', u'http://android.com.pl/feed/')]
recipes/astroflesz.recipe (new file, 19 lines)

@@ -0,0 +1,19 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Astroflesz(BasicNewsRecipe):
+    title = u'Astroflesz'
+    oldest_article = 7
+    __author__ = 'fenuks'
+    description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne'
+    category = 'astronomy'
+    language = 'pl'
+    cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png'
+    ignore_duplicate_articles = {'title', 'url'}
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    keep_only_tags = [dict(id="k2Container")]
+    remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
+    remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
+    feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')]
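A recipe added in a changeset like this one can be exercised locally before it ships. A minimal smoke test, assuming a standard calibre install where the ebook-convert command line tool accepts a .recipe file as input (these are calibre's usual recipe-testing flags, not something introduced by this commit)::

  ebook-convert recipes/astroflesz.recipe out.epub --test -vv

With --test only a couple of articles per feed are fetched, so a broken feed URL or selector shows up quickly in the verbose log.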
@@ -1,9 +1,11 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
+import mechanize

 class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     title = u'Birmingham post'
     description = 'Author D.Asbury. News for Birmingham UK'
     #timefmt = ''
-    # last update 8/9/12
+
     __author__ = 'Dave Asbury'
     cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
     oldest_article = 2

@@ -15,8 +17,30 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     #auto_cleanup = True
     language = 'en_GB'

+    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
+
-    masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
+    masthead_url = 'http://www.trinitymirror.com/images/birminghampost-logo.gif'
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.birminghampost.net')
+        # look for the block containing the sun button and url
+        cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Birmingham Post')})
+        print
+        print '%%%%%%%%%%%%%%%',cov
+        print
+        cov2 = str(cov['src'])
+        # cov2=cov2[7:]
+        print '88888888 ',cov2,' 888888888888'
+
+        #cover_url=cov2
+        #return cover_url
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
+        try:
+            br.open_novisit(cov2)
+            cover_url = cov2
+        except:
+            cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
+        return cover_url
+
     keep_only_tags = [
@@ -2,32 +2,35 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class AdvancedUserRecipe1303841067(BasicNewsRecipe):

     title = u'Börse-online'
-    __author__ = 'schuster'
+    __author__ = 'schuster, Armin Geller'
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
     language = 'de'
     remove_javascript = True
-    cover_url = 'http://www.dpv.de/images/1995/source.gif'
-    masthead_url = 'http://www.zeitschriften-cover.de/cover/boerse-online-cover-januar-2010-x1387.jpg'
-    extra_css = '''
-    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-    h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-    img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}
-    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
-    remove_tags_bevor = [dict(name='h3')]
-    remove_tags_after = [dict(name='div', attrs={'class':'artikelfuss'})]
-    remove_tags = [dict(attrs={'class':['moduleTopNav', 'moduleHeaderNav', 'text', 'blau', 'poll1150']}),
-        dict(id=['newsletterlayer', 'newsletterlayerClose', 'newsletterlayer_body', 'newsletterarray_error', 'newsletterlayer_emailadress', 'newsletterlayer_submit', 'kommentar']),
-        dict(name=['h2', 'Gesamtranking', 'h3',''])]
-    feeds = [(u'Börsennachrichten', u'http://www.boerse-online.de/rss/')]
+    encoding = 'iso-8859-1'
+    timefmt = ' [%a, %d %b %Y]'
+    cover_url = 'http://www.wirtschaftsmedien-shop.de/s/media/coverimages/7576_2013107.jpg'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/de/5/56/B%C3%B6rse_Online_Logo.svg'
+    remove_tags_after = [dict(name='div', attrs={'class':['artikelfuss', 'rahmen600']})]
+    remove_tags = [
+        dict(name='div', attrs={'id':['breadcrumb', 'rightCol', 'clearall']}),
+        dict(name='div', attrs={'class':['footer', 'artikelfuss']}),
+        ]
+    keep_only_tags = [
+        dict(name='div', attrs={'id':['contentWrapper']})
+        ]
+    feeds = [(u'Börsennachrichten', u'http://www.boerse-online.de/rss/')]

     def print_version(self, url):
         return url.replace('.html#nv=rss', '.html?mode=print')
@@ -13,14 +13,13 @@ class BusinessWeekMagazine(BasicNewsRecipe):
     keep_only_tags = [
         dict(name='div', attrs={'id':'article_body_container'}),
         ]
-    remove_tags = [dict(name='ui'),dict(name='li')]
+    remove_tags = [dict(name='ui'),dict(name='li'),dict(name='div', attrs={'id':['share-email']})]
     no_javascript = True
     no_stylesheets = True
-
     cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

     def parse_index(self):

         #Go to the issue
         soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

@@ -47,7 +46,6 @@ class BusinessWeekMagazine(BasicNewsRecipe):
             if section_title not in feeds:
                 feeds[section_title] = []
             feeds[section_title] += articles

-        div1 = soup.find ('div', attrs={'class':'column center'})
         section_title = ''
         for div in div1.findAll('h5'):
@@ -7,24 +7,29 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
     __author__ = 'Dave Asbury'
     description = 'The official website of Countryfile Magazine'
-    # last updated 7/10/12
+    # last updated 8/12/12
     language = 'en_GB'
     oldest_article = 30
     max_articles_per_feed = 25
     remove_empty_feeds = True
     no_stylesheets = True
     auto_cleanup = True
+    ignore_duplicate_articles = {'title', 'url'}
     #articles_are_obfuscated = True
-    ignore_duplicate_articles = {'title'}
+    #article_already_exists = False
+    #feed_hash = ''
     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.countryfile.com/')
+        soup = self.index_to_soup('http://www.countryfile.com/magazine')
+        cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px_wide')})#'width' : '160',
+        print '&&&&&&&& ',cov,' ***'
+        cov=str(cov)
+        #cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
+        cov2 = re.findall('/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
+
+        cov2 = str(cov2)
+        cov2= "http://www.countryfile.com"+cov2[2:len(cov2)-8]
+
-        cov = soup.find(attrs={'width' : '160', 'class' : re.compile('imagecache imagecache-160px_wide')})
-        print '******** ',cov,' ***'
-        cov2 = str(cov)
-        cov2=cov2[10:101]
         print '******** ',cov2,' ***'
-        #cov2='http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg'
         # try to get cover - if can't get known cover
         br = browser()

@@ -45,5 +50,3 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
     ]

-
-
recipes/czas_gentlemanow.recipe (new file, 20 lines)

@@ -0,0 +1,20 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CzasGentlemanow(BasicNewsRecipe):
+    title = u'Czas Gentlemanów'
+    __author__ = 'fenuks'
+    description = u'Historia mężczyzn z dala od wielkiej polityki'
+    category = 'blog'
+    language = 'pl'
+    cover_url = 'http://czasgentlemanow.pl/wp-content/uploads/2012/10/logo-Czas-Gentlemanow1.jpg'
+    ignore_duplicate_articles = {'title', 'url'}
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    use_embedded_content = False
+    keep_only_tags = [dict(name='div', attrs={'class':'content'})]
+    remove_tags = [dict(attrs={'class':'meta_comments'})]
+    remove_tags_after = dict(name='div', attrs={'class':'fblikebutton_button'})
+    feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')]
@@ -7,18 +7,64 @@ class Dzieje(BasicNewsRecipe):
     cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
     category = 'history'
     language = 'pl'
-    index='http://dzieje.pl'
+    ignore_duplicate_articles = {'title', 'url'}
+    index = 'http://dzieje.pl'
     oldest_article = 8
     max_articles_per_feed = 100
     remove_javascript=True
     no_stylesheets= True
     keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')]
     remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')]
-    feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
+    #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
+
+    def append_page(self, soup, appendtag):
+        tag = appendtag.find('li', attrs={'class':'pager-next'})
+        if tag:
+            while tag:
+                url = tag.a['href']
+                if not url.startswith('http'):
+                    url = 'http://dzieje.pl'+tag.a['href']
+                soup2 = self.index_to_soup(url)
+                pagetext = soup2.find(id='content-area').find(attrs={'class':'content'})
+                for r in pagetext.findAll(attrs={'class':['fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult', 'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka', 'fieldgroup group-zdjeciedu', 'field field-type-filefield field-field-zdjecieglownawyd']}):
+                    r.extract()
+                pos = len(appendtag.contents)
+                appendtag.insert(pos, pagetext)
+                tag = soup2.find('li', attrs={'class':'pager-next'})
+            for r in appendtag.findAll(attrs={'class':['item-list', 'field field-type-computed field-field-tagi', ]}):
+                r.extract()
+
+    def find_articles(self, url):
+        articles = []
+        soup=self.index_to_soup(url)
+        tag=soup.find(id='content-area').div.div
+        for i in tag.findAll('div', recursive=False):
+            temp = i.find(attrs={'class':'views-field-title'}).span.a
+            title = temp.string
+            url = self.index + temp['href']
+            date = '' #i.find(attrs={'class':'views-field-created'}).span.string
+            articles.append({'title' : title,
+                'url' : url,
+                'date' : date,
+                'description' : ''
+                })
+        return articles
+
+    def parse_index(self):
+        feeds = []
+        feeds.append((u"Wiadomości", self.find_articles('http://dzieje.pl/wiadomosci')))
+        feeds.append((u"Kultura i sztuka", self.find_articles('http://dzieje.pl/kulturaisztuka')))
+        feeds.append((u"Film", self.find_articles('http://dzieje.pl/kino')))
+        feeds.append((u"Rozmaitości historyczne", self.find_articles('http://dzieje.pl/rozmaitości')))
+        feeds.append((u"Książka", self.find_articles('http://dzieje.pl/ksiazka')))
+        feeds.append((u"Wystawa", self.find_articles('http://dzieje.pl/wystawa')))
+        feeds.append((u"Edukacja", self.find_articles('http://dzieje.pl/edukacja')))
+        feeds.append((u"Dzieje się", self.find_articles('http://dzieje.pl/wydarzenia')))
+        return feeds

     def preprocess_html(self, soup):
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                 a['href']=self.index + a['href']
+        self.append_page(soup, soup.body)
         return soup
@@ -70,18 +70,6 @@ class Economist(BasicNewsRecipe):
         return br
     '''

-    def get_cover_url(self):
-        soup = self.index_to_soup('http://www.economist.com/printedition/covers')
-        div = soup.find('div', attrs={'class':lambda x: x and
-            'print-cover-links' in x})
-        a = div.find('a', href=True)
-        url = a.get('href')
-        if url.startswith('/'):
-            url = 'http://www.economist.com' + url
-        soup = self.index_to_soup(url)
-        div = soup.find('div', attrs={'class':'cover-content'})
-        img = div.find('img', src=True)
-        return img.get('src')
-
     def parse_index(self):
         return self.economist_parse_index()

@@ -92,7 +80,7 @@ class Economist(BasicNewsRecipe):
         if div is not None:
             img = div.find('img', src=True)
             if img is not None:
-                self.cover_url = img['src']
+                self.cover_url = re.sub('thumbnail','full',img['src'])
         feeds = OrderedDict()
         for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
             x}):

@@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
 from collections import OrderedDict

-import time, re
+import re

 class Economist(BasicNewsRecipe):

@@ -37,7 +37,6 @@ class Economist(BasicNewsRecipe):
         padding: 7px 0px 9px;
     }
     '''
-
     oldest_article = 7.0
     remove_tags = [
         dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),

@@ -46,7 +45,6 @@ class Economist(BasicNewsRecipe):
         {'class': lambda x: x and 'share-links-header' in x},
         ]
     keep_only_tags = [dict(id='ec-article-body')]
-    needs_subscription = False
     no_stylesheets = True
     preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
         lambda x:'</html>')]

@@ -55,27 +53,25 @@ class Economist(BasicNewsRecipe):
     # downloaded with connection reset by peer (104) errors.
     delay = 1

-    def get_cover_url(self):
-        soup = self.index_to_soup('http://www.economist.com/printedition/covers')
-        div = soup.find('div', attrs={'class':lambda x: x and
-            'print-cover-links' in x})
-        a = div.find('a', href=True)
-        url = a.get('href')
-        if url.startswith('/'):
-            url = 'http://www.economist.com' + url
-        soup = self.index_to_soup(url)
-        div = soup.find('div', attrs={'class':'cover-content'})
-        img = div.find('img', src=True)
-        return img.get('src')
+    needs_subscription = False
+    '''
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username and self.password:
+            br.open('http://www.economist.com/user/login')
+            br.select_form(nr=1)
+            br['name'] = self.username
+            br['pass'] = self.password
+            res = br.submit()
+            raw = res.read()
+            if '>Log out<' not in raw:
+                raise ValueError('Failed to login to economist.com. '
+                    'Check your username and password.')
+        return br
+    '''

     def parse_index(self):
-        try:
-            return self.economist_parse_index()
-        except:
-            raise
-            self.log.warn(
-                'Initial attempt to parse index failed, retrying in 30 seconds')
-            time.sleep(30)
         return self.economist_parse_index()

     def economist_parse_index(self):

@@ -84,7 +80,7 @@ class Economist(BasicNewsRecipe):
         if div is not None:
             img = div.find('img', src=True)
             if img is not None:
-                self.cover_url = img['src']
+                self.cover_url = re.sub('thumbnail','full',img['src'])
         feeds = OrderedDict()
         for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
             x}):

@@ -151,154 +147,3 @@ class Economist(BasicNewsRecipe):
             div.insert(2, img)
             table.replaceWith(div)
         return soup
-
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.utils.threadpool import ThreadPool, makeRequests
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-import time, string, re
-from datetime import datetime
-from lxml import html
-
-class Economist(BasicNewsRecipe):
-
-    title = 'The Economist (RSS)'
-    language = 'en'
-
-    __author__ = "Kovid Goyal"
-    description = ('Global news and current affairs from a European'
-            ' perspective. Best downloaded on Friday mornings (GMT).'
-            ' Much slower than the print edition based version.')
-    extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
-    oldest_article = 7.0
-    cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
-    #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
-    remove_tags = [
-        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
-        dict(attrs={'class':['dblClkTrk', 'ec-article-info',
-            'share_inline_header', 'related-items']}),
-        {'class': lambda x: x and 'share-links-header' in x},
-        ]
-    keep_only_tags = [dict(id='ec-article-body')]
-    no_stylesheets = True
-    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
-        lambda x:'</html>')]
-
-    def parse_index(self):
-        from calibre.web.feeds.feedparser import parse
-        if self.test:
-            self.oldest_article = 14.0
-        raw = self.index_to_soup(
-                'http://feeds.feedburner.com/economist/full_print_edition',
-                raw=True)
-        entries = parse(raw).entries
-        pool = ThreadPool(10)
-        self.feed_dict = {}
-        requests = []
-        for i, item in enumerate(entries):
-            title = item.get('title', _('Untitled article'))
-            published = item.date_parsed
-            if not published:
-                published = time.gmtime()
-            utctime = datetime(*published[:6])
-            delta = datetime.utcnow() - utctime
-            if delta.days*24*3600 + delta.seconds > 24*3600*self.oldest_article:
-                self.log.debug('Skipping article %s as it is too old.'%title)
-                continue
-            link = item.get('link', None)
-            description = item.get('description', '')
-            author = item.get('author', '')
-
-            requests.append([i, link, title, description, author, published])
-        if self.test:
-            requests = requests[:4]
-        requests = makeRequests(self.process_eco_feed_article, requests, self.eco_article_found,
-                self.eco_article_failed)
-        for r in requests: pool.putRequest(r)
-        pool.wait()
-
-        return self.eco_sort_sections([(t, a) for t, a in
-            self.feed_dict.items()])
-
-    def eco_sort_sections(self, feeds):
-        if not feeds:
-            raise ValueError('No new articles found')
-        order = {
-            'The World This Week': 1,
-            'Leaders': 2,
-            'Letters': 3,
-            'Briefing': 4,
-            'Business': 5,
-            'Finance And Economics': 6,
-            'Science & Technology': 7,
-            'Books & Arts': 8,
-            'International': 9,
-            'United States': 10,
-            'Asia': 11,
-            'Europe': 12,
-            'The Americas': 13,
-            'Middle East & Africa': 14,
-            'Britain': 15,
-            'Obituary': 16,
-        }
-        return sorted(feeds, cmp=lambda x,y:cmp(order.get(x[0], 100),
-            order.get(y[0], 100)))
-
-    def process_eco_feed_article(self, args):
-        from calibre import browser
-        i, url, title, description, author, published = args
-        br = browser()
-        ret = br.open(url)
-        raw = ret.read()
-        url = br.geturl().split('?')[0]+'/print'
-        root = html.fromstring(raw)
-        matches = root.xpath('//*[@class = "ec-article-info"]')
-        feedtitle = 'Miscellaneous'
-        if matches:
-            feedtitle = string.capwords(html.tostring(matches[-1], method='text',
-                encoding=unicode).split('|')[-1].strip())
-        return (i, feedtitle, url, title, description, author, published)
-
-    def eco_article_found(self, req, result):
-        from calibre.web.feeds import Article
-        i, feedtitle, link, title, description, author, published = result
-        self.log('Found print version for article:', title, 'in', feedtitle,
-                'at', link)
-
-        a = Article(i, title, link, author, description, published, '')
-
-        article = dict(title=a.title, description=a.text_summary,
-            date=time.strftime(self.timefmt, a.date), author=a.author, url=a.url)
-        if feedtitle not in self.feed_dict:
-            self.feed_dict[feedtitle] = []
-        self.feed_dict[feedtitle].append(article)
-
-    def eco_article_failed(self, req, tb):
-        self.log.error('Failed to download %s with error:'%req.args[0][2])
-        self.log.debug(tb)
-
-    def eco_find_image_tables(self, soup):
-        for x in soup.findAll('table', align=['right', 'center']):
-            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
-                yield x
-
-    def postprocess_html(self, soup, first):
-        body = soup.find('body')
-        for name, val in body.attrs:
-            del body[name]
-        for table in list(self.eco_find_image_tables(soup)):
-            caption = table.find('font')
-            img = table.find('img')
-            div = Tag(soup, 'div')
-            div['style'] = 'text-align:left;font-size:70%'
-            ns = NavigableString(self.tag_to_string(caption))
-            div.insert(0, ns)
-            div.insert(1, Tag(soup, 'br'))
-            img.extract()
-            del img['width']
-            del img['height']
-            div.insert(2, img)
-            table.replaceWith(div)
-        return soup
-'''
recipes/ekologia_pl.recipe (new file, 24 lines)

@@ -0,0 +1,24 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+class EkologiaPl(BasicNewsRecipe):
+    title = u'Ekologia.pl'
+    __author__ = 'fenuks'
+    description = u'Portal ekologiczny - eko, ekologia, ochrona przyrody, ochrona środowiska, przyroda, środowisko online. Ekologia i ochrona środowiska. Ekologia dla dzieci.'
+    category = 'ecology'
+    language = 'pl'
+    cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png'
+    ignore_duplicate_articles = {'title', 'url'}
+    extra_css = '.title {font-size: 200%;}'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    use_embedded_content = False
+    remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})]
+
+    feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')]
+
+    def print_version(self, url):
+        id = re.search(r',(?P<id>\d+)\.html', url).group('id')
+        return 'http://drukuj.ekologia.pl/artykul/' + id
|
118    recipes/el_diplo.recipe    Normal file
@ -0,0 +1,118 @@
# Copyright 2013 Tomás Di Domenico
#
# This is a news fetching recipe for the Calibre ebook software, for
# fetching the Cono Sur edition of Le Monde Diplomatique (www.eldiplo.org).
#
# This recipe is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this recipe.  If not, see <http://www.gnu.org/licenses/>.

import re
from contextlib import closing
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.magick import Image

class ElDiplo_Recipe(BasicNewsRecipe):
    title = u'El Diplo'
    __author__ = 'Tomas Di Domenico'
    description = 'Publicacion mensual de Le Monde Diplomatique, edicion Argentina'
    language = 'es_AR'
    needs_subscription = True
    auto_cleanup = True

    def get_cover(self,url):
        tmp_cover = PersistentTemporaryFile(suffix = ".jpg", prefix = "eldiplo_")
        self.cover_url = tmp_cover.name

        with closing(self.browser.open(url)) as r:
            imgdata = r.read()

        img = Image()
        img.load(imgdata)
        img.crop(img.size[0],img.size[1]/2,0,0)

        img.save(tmp_cover.name)

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.eldiplo.org/index.php/login/-/do_login/index.html')
            br.select_form(nr=3)
            br['uName'] = self.username
            br['uPassword'] = self.password
            br.submit()
        self.browser = br
        return br

    def parse_index(self):
        default_sect = 'General'
        articles = {default_sect:[]}
        ans = [default_sect]
        sectionsmarker = 'DOSSIER_TITLE: '
        sectionsre = re.compile('^'+sectionsmarker)

        soup = self.index_to_soup('http://www.eldiplo.org/index.php')

        coverdivs = soup.findAll(True,attrs={'id':['lmd-foto']})
        a = coverdivs[0].find('a', href=True)
        coverurl = a['href'].split("?imagen=")[1]
        self.get_cover(coverurl)

        thedivs = soup.findAll(True,attrs={'class':['lmd-leermas']})
        for div in thedivs:
            a = div.find('a', href=True)
            if 'Sumario completo' in self.tag_to_string(a, use_alt=True):
                summaryurl = re.sub(r'\?.*', '', a['href'])
                summaryurl = 'http://www.eldiplo.org' + summaryurl

        for pagenum in xrange(1,10):
            soup = self.index_to_soup('{0}/?cms1_paging_p_b32={1}'.format(summaryurl,pagenum))
            thedivs = soup.findAll(True,attrs={'class':['interna']})

            if len(thedivs) == 0:
                break

            for div in thedivs:
                section = div.find(True,text=sectionsre).replace(sectionsmarker,'')
                if section == '':
                    section = default_sect

                if section not in articles.keys():
                    articles[section] = []
                    ans.append(section)

                nota = div.find(True,attrs={'class':['lmd-pl-titulo-nota-dossier']})
                a = nota.find('a', href=True)
                if not a:
                    continue

                url = re.sub(r'\?.*', '', a['href'])
                url = 'http://www.eldiplo.org' + url
                title = self.tag_to_string(a, use_alt=True).strip()

                summary = div.find(True, attrs={'class':'lmd-sumario-descript'}).find('p')
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)

                aut = div.find(True, attrs={'class':'lmd-autor-sumario'})
                if aut:
                    auth = self.tag_to_string(aut, use_alt=False).strip()

                if not articles.has_key(section):
                    articles[section] = []

                articles[section].append(dict(title=title,author=auth,url=url,date=None,description=description,content=''))

        #ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        ans = [(section, articles[section]) for section in ans if articles.has_key(section)]
        return ans
@ -5,6 +5,7 @@ class AdvancedUserRecipe1341650280(BasicNewsRecipe):
     title = u'Empire Magazine'
     description = 'Author D.Asbury. Film articles from Empire Mag. '
+    language = 'en'
     __author__ = 'Dave Asbury'
     # last updated 7/7/12
     remove_empty_feeds = True
19    recipes/film_org_pl.recipe    Normal file
@ -0,0 +1,19 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re

class FilmOrgPl(BasicNewsRecipe):
    title = u'Film.org.pl'
    __author__ = 'fenuks'
    description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce."
    category = 'film'
    language = 'pl'
    cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png'
    ignore_duplicate_articles = {'title', 'url'}
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = True
    preprocess_regexps = [(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE|re.DOTALL), lambda m: '</body>'), (re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''), (re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: '')]
    remove_tags = [dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']})]
    feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')]
@ -17,6 +17,7 @@ class FilmWebPl(BasicNewsRecipe):
     preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
     extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
     remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
+    remove_attributes = ['style',]
     keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
     feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
             (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
@ -50,4 +51,9 @@ class FilmWebPl(BasicNewsRecipe):
         for i in soup.findAll('sup'):
             if not i.string or i.string.startswith('(kliknij'):
                 i.extract()
+        tag = soup.find(name='ul', attrs={'class':'inline sep-line'})
+        if tag:
+            tag.name = 'div'
+            for t in tag.findAll('li'):
+                t.name = 'div'
         return soup
@ -18,7 +18,7 @@ class Fleshbot(BasicNewsRecipe):
     encoding = 'utf-8'
     use_embedded_content = True
     language = 'en'
-    masthead_url = 'http://cache.gawkerassets.com/assets/kotaku.com/img/logo.png'
+    masthead_url = 'http://fbassets.s3.amazonaws.com/images/uploads/2012/01/fleshbot-logo.png'
     extra_css = '''
         body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif}
         img{margin-bottom: 1em}
@ -31,7 +31,7 @@ class Fleshbot(BasicNewsRecipe):
         , 'language' : language
         }

-    feeds = [(u'Articles', u'http://feeds.gawker.com/fleshbot/vip?format=xml')]
+    feeds = [(u'Articles', u'http://www.fleshbot.com/feed')]

     remove_tags = [
         {'class': 'feedflare'},
@ -11,21 +11,21 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
     by Chen Wei weichen302@gmx.com, 2012-02-05'''

     __license__ = 'GPL v3'
-    __author__ = 'kwetal'
+    __author__ = 'Rick Shang, kwetal'
     language = 'en'
     version = 1.01

-    title = u'Foreign Affairs (Subcription or (free) Registration)'
+    title = u'Foreign Affairs (Subcription)'
     publisher = u'Council on Foreign Relations'
     category = u'USA, Foreign Affairs'
     description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

     no_stylesheets = True
     remove_javascript = True
+    needs_subscription = True

     INDEX = 'http://www.foreignaffairs.com'
     FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
-    INCLUDE_PREMIUM = False


     remove_tags = []
@ -68,42 +68,56 @@ class ForeignAffairsRecipe(BasicNewsRecipe):

     def parse_index(self):

         answer = []
         soup = self.index_to_soup(self.FRONTPAGE)
-        sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
-        for sec in sec_start:
-            content = sec.nextSibling
-            if content:
-                section = self.tag_to_string(content.find('h2'))
-                articles = []
-                tags = []
-                for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
-                    tags.append(div)
-                for li in content.findAll('li'):
-                    tags.append(li)
-                for div in tags:
-                    title = url = description = author = None
-                    if self.INCLUDE_PREMIUM:
-                        found_premium = False
-                    else:
-                        found_premium = div.findAll('span', attrs={'class':
-                            'premium-icon'})
-                    if not found_premium:
-                        tag = div.find('div', attrs={'class': 'views-field-title'})
-                        if tag:
-                            a = tag.find('a')
-                            if a:
-                                title = self.tag_to_string(a)
-                                url = self.INDEX + a['href']
-                                author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
-                                tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
-                                description = self.tag_to_string(tag_summary)
-                                articles.append({'title':title, 'date':None, 'url':url,
-                                    'description':description, 'author':author})
+        #get dates
+        date = re.split('\s\|\s',self.tag_to_string(soup.head.title.string))[0]
+        self.timefmt = u' [%s]'%date
+
+        sec_start = soup.findAll('div', attrs= {'class':'panel-pane'})
+        for sec in sec_start:
+            articles = []
+            section = self.tag_to_string(sec.find('h2'))
+            if 'Books' in section:
+                reviewsection=sec.find('div', attrs = {'class': 'item-list'})
+                for subsection in reviewsection.findAll('div'):
+                    subsectiontitle=self.tag_to_string(subsection.span.a)
+                    subsectionurl=self.INDEX + subsection.span.a['href']
+                    soup1 = self.index_to_soup(subsectionurl)
+                    for div in soup1.findAll('div', attrs = {'class': 'views-field-title'}):
+                        if div.find('a') is not None:
+                            originalauthor=self.tag_to_string(div.findNext('div', attrs = {'class':'views-field-field-article-book-nid'}).div.a)
+                            title=subsectiontitle+': '+self.tag_to_string(div.span.a)+' by '+originalauthor
+                            url=self.INDEX+div.span.a['href']
+                            atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
+                            if atr is not None:
+                                author=self.tag_to_string(atr.span.a)
+                            else:
+                                author=''
+                            desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'})
+                            if desc is not None:
+                                description=self.tag_to_string(desc.div.p)
+                            else:
+                                description=''
+                            articles.append({'title':title, 'date':None, 'url':url, 'description':description, 'author':author})
+                    subsectiontitle=''
+            else:
+                for div in sec.findAll('div', attrs = {'class': 'views-field-title'}):
+                    if div.find('a') is not None:
+                        title=self.tag_to_string(div.span.a)
+                        url=self.INDEX+div.span.a['href']
+                        atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
+                        if atr is not None:
+                            author=self.tag_to_string(atr.span.a)
+                        else:
+                            author=''
+                        desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'})
+                        if desc is not None:
+                            description=self.tag_to_string(desc.div.p)
+                        else:
+                            description=''
+                        articles.append({'title':title, 'date':None, 'url':url, 'description':description, 'author':author})
             if articles:
                 answer.append((section, articles))
         return answer
@ -115,15 +129,17 @@ class ForeignAffairsRecipe(BasicNewsRecipe):

         return soup

-    needs_subscription = True

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
-            br.open('https://www.foreignaffairs.com/user?destination=home')
+            br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
             br.select_form(nr = 1)
             br['name'] = self.username
             br['pass'] = self.password
             br.submit()
         return br

+    def cleanup(self):
+        self.browser.open('http://www.foreignaffairs.com/logout?destination=user%3Fop=lo')
@ -4,9 +4,10 @@ import re
 class Gildia(BasicNewsRecipe):
     title = u'Gildia.pl'
     __author__ = 'fenuks'
-    description = 'Gildia - cultural site'
+    description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!'
     cover_url = 'http://www.film.gildia.pl/_n_/portal/redakcja/logo/logo-gildia.pl-500.jpg'
     category = 'culture'
+    cover_url = 'http://gildia.pl/images/logo-main.png'
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
@ -23,10 +24,13 @@ class Gildia(BasicNewsRecipe):
         content = soup.find('div', attrs={'class':'news'})
         if 'recenzj' in soup.title.string.lower():
             for link in content.findAll(name='a'):
-                if 'recenzj' in link['href']:
-                    self.log.warn('odnosnik')
-                    self.log.warn(link['href'])
+                if 'recenzj' in link['href'] or 'muzyka/plyty' in link['href']:
                     return self.index_to_soup(link['href'], raw=True)
+        if 'fragmen' in soup.title.string.lower():
+            for link in content.findAll(name='a'):
+                if 'fragment' in link['href']:
+                    return self.index_to_soup(link['href'], raw=True)

     def preprocess_html(self, soup):
         for a in soup('a'):
@ -1,19 +1,20 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 class Gram_pl(BasicNewsRecipe):
     title = u'Gram.pl'
     __author__ = 'fenuks'
-    description = 'Gram.pl - site about computer games'
+    description = u'Serwis społecznościowy o grach: recenzje, newsy, zapowiedzi, encyklopedia gier, forum. Gry PC, PS3, X360, PS Vita, sprzęt dla graczy.'
     category = 'games'
     language = 'pl'
     oldest_article = 8
     index='http://www.gram.pl'
     max_articles_per_feed = 100
+    ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets= True
-    extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
+    #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
     cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
-    remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent', 'sharedaddy sd-sharing-enabled']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
-    keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')]
+    keep_only_tags= [dict(id='articleModule')]
+    remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter']})]
     feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
             (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'),
             (u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'),
@ -28,35 +29,21 @@ class Gram_pl(BasicNewsRecipe):
                     feed.articles.remove(article)
         return feeds

-    def append_page(self, soup, appendtag):
-        nexturl = appendtag.find('a', attrs={'class':'cpn'})
-        while nexturl:
-            soup2 = self.index_to_soup('http://www.gram.pl'+ nexturl['href'])
-            r=appendtag.find(id='pgbox')
-            if r:
-                r.extract()
-            pagetext = soup2.find(attrs={'class':'main'})
-            r=pagetext.find('h1')
-            if r:
-                r.extract()
-            r=pagetext.find('h2')
-            if r:
-                r.extract()
-            for r in pagetext.findAll('script'):
-                r.extract()
-            pos = len(appendtag.contents)
-            appendtag.insert(pos, pagetext)
-            nexturl = appendtag.find('a', attrs={'class':'cpn'})
-        r=appendtag.find(id='pgbox')
-        if r:
-            r.extract()
-
     def preprocess_html(self, soup):
-        self.append_page(soup, soup.body)
-        tag=soup.findAll(name='div', attrs={'class':'picbox'})
-        for t in tag:
-            t['style']='float: left;'
+        tag=soup.find(name='div', attrs={'class':'summary'})
+        if tag:
+            tag.find(attrs={'class':'pros'}).insert(0, BeautifulSoup('<h2>Plusy:</h2>').h2)
+            tag.find(attrs={'class':'cons'}).insert(0, BeautifulSoup('<h2>Minusy:</h2>').h2)
+        tag = soup.find(name='section', attrs={'class':'cenzurka'})
+        if tag:
+            rate = tag.p.img['data-ocena']
+            tag.p.img.extract()
+            tag.p.insert(len(tag.p.contents)-2, BeautifulSoup('<h2>Ocena: {0}</h2>'.format(rate)).h2)
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                 a['href']=self.index + a['href']
+        tag=soup.find(name='span', attrs={'class':'platforma'})
+        if tag:
+            tag.name = 'p'
         return soup
@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 harpers.org
 '''
@ -16,6 +16,7 @@ class Harpers(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
+    masthead_url = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif'

     conversion_options = {
                           'comment' : description
@ -31,27 +32,9 @@ class Harpers(BasicNewsRecipe):
     .caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;}
     '''

-    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
+    keep_only_tags = [ dict(name='div', attrs={'class':['postdetailFull', 'articlePost']}) ]
-    remove_tags = [
-        dict(name='table', attrs={'class':['rcnt','rcnt topline']})
-        ,dict(name=['link','object','embed','meta','base'])
-    ]
+    remove_tags = [dict(name=['link','object','embed','meta','base'])]
     remove_attributes = ['width','height']

-    feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]
+    feeds = [(u"Harper's Magazine", u'http://harpers.org/feed/')]

-    def get_cover_url(self):
-        cover_url = None
-        index = 'http://harpers.org/'
-        soup = self.index_to_soup(index)
-        link_item = soup.find(name = 'img',attrs= {'class':"cover"})
-        if link_item:
-            cover_url = 'http://harpers.org' + link_item['src']
-        return cover_url
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll(xmlns=True):
-            del item['xmlns']
-        return soup
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
'''
|
'''
|
||||||
harpers.org - paid subscription/ printed issue articles
|
harpers.org - paid subscription/ printed issue articles
|
||||||
This recipe only get's article's published in text format
|
This recipe only get's article's published in text format
|
||||||
images and pdf's are ignored
|
images and pdf's are ignored
|
||||||
|
If you have institutional subscription based on access IP you do not need to enter
|
||||||
|
anything in username/password fields
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import time, re
|
||||||
|
import urllib
|
||||||
from calibre import strftime
|
from calibre import strftime
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class Harpers_full(BasicNewsRecipe):
|
class Harpers_full(BasicNewsRecipe):
|
||||||
title = "Harper's Magazine - articles from printed edition"
|
title = "Harper's Magazine - articles from printed edition"
|
||||||
__author__ = 'Darko Miletic'
|
__author__ = 'Darko Miletic'
|
||||||
description = "Harper's Magazine: Founded June 1850."
|
description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."
|
||||||
publisher = "Harpers's"
|
publisher = "Harpers's"
|
||||||
category = 'news, politics, USA'
|
category = 'news, politics, USA'
|
||||||
oldest_article = 30
|
oldest_article = 30
|
||||||
@ -21,13 +25,16 @@ class Harpers_full(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
delay = 1
|
delay = 1
|
||||||
language = 'en'
|
language = 'en'
|
||||||
needs_subscription = True
|
encoding = 'utf8'
|
||||||
masthead_url = 'http://www.harpers.org/media/image/Harpers_305x100.gif'
|
needs_subscription = 'optional'
|
||||||
|
masthead_url = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif'
|
||||||
publication_type = 'magazine'
|
publication_type = 'magazine'
|
||||||
INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
|
LOGIN = 'http://harpers.org/wp-content/themes/harpers/ajax_login.php'
|
||||||
LOGIN = 'http://www.harpers.org'
|
extra_css = """
|
||||||
cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
|
body{font-family: adobe-caslon-pro,serif}
|
||||||
extra_css = ' body{font-family: "Georgia",serif} '
|
.category{font-size: small}
|
||||||
|
.articlePost p:first-letter{display: inline; font-size: xx-large; font-weight: bold}
|
||||||
|
"""
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comment' : description
|
'comment' : description
|
||||||
@ -36,32 +43,53 @@ class Harpers_full(BasicNewsRecipe):
|
|||||||
, 'language' : language
|
, 'language' : language
|
||||||
}
|
}
|
||||||
|
|
||||||
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
|
keep_only_tags = [ dict(name='div', attrs={'class':['postdetailFull','articlePost']}) ]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='table', attrs={'class':['rcnt','rcnt topline']})
|
dict(name='div', attrs={'class':'fRight rightDivPad'})
|
||||||
,dict(name='link')
|
,dict(name=['link','meta','object','embed','iframe'])
|
||||||
]
|
]
|
||||||
remove_attributes=['xmlns']
|
remove_attributes=['xmlns']
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
|
br.open('http://harpers.org/')
|
||||||
if self.username is not None and self.password is not None:
|
if self.username is not None and self.password is not None:
|
||||||
br.open(self.LOGIN)
|
tt = time.localtime()*1000
|
||||||
br.select_form(nr=1)
|
data = urllib.urlencode({ 'm':self.username
|
||||||
br['handle' ] = self.username
|
,'p':self.password
|
||||||
br['password'] = self.password
|
,'rt':'http://harpers.org/'
|
||||||
br.submit()
|
,'tt':tt
|
||||||
|
})
|
||||||
|
br.open(self.LOGIN, data)
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
|
#find current issue
|
||||||
|
soup = self.index_to_soup('http://harpers.org/')
|
||||||
|
currentIssue=soup.find('div',attrs={'class':'mainNavi'}).find('li',attrs={'class':'curentIssue'})
|
||||||
|
currentIssue_url=self.tag_to_string(currentIssue.a['href'])
|
||||||
|
self.log(currentIssue_url)
|
||||||
|
|
||||||
|
#go to the current issue
|
||||||
|
soup1 = self.index_to_soup(currentIssue_url)
|
||||||
|
date = re.split('\s\|\s',self.tag_to_string(soup1.head.title.string))[0]
|
||||||
|
self.timefmt = u' [%s]'%date
|
||||||
|
|
||||||
|
#get cover
|
||||||
|
coverurl='http://harpers.org/wp-content/themes/harpers/ajax_microfiche.php?img=harpers-'+re.split('harpers.org/',currentIssue_url)[1]+'gif/0001.gif'
|
||||||
|
soup2 = self.index_to_soup(coverurl)
|
||||||
|
self.cover_url = self.tag_to_string(soup2.find('img')['src'])
|
||||||
|
self.log(self.cover_url)
|
||||||
articles = []
|
articles = []
|
||||||
print 'Processing ' + self.INDEX
|
count = 0
|
||||||
soup = self.index_to_soup(self.INDEX)
|
for item in soup1.findAll('div', attrs={'class':'articleData'}):
|
||||||
for item in soup.findAll('div', attrs={'class':'title'}):
|
text_links = item.findAll('h2')
|
||||||
text_link = item.parent.find('img',attrs={'alt':'Text'})
|
for text_link in text_links:
|
||||||
if text_link:
|
if count == 0:
|
||||||
url = self.LOGIN + item.a['href']
|
count = 1
|
||||||
title = item.a.contents[0]
|
else:
|
||||||
|
url = text_link.a['href']
|
||||||
|
title = text_link.a.contents[0]
|
||||||
date = strftime(' %B %Y')
|
date = strftime(' %B %Y')
|
||||||
articles.append({
|
articles.append({
|
||||||
'title' :title
|
'title' :title
|
||||||
@ -69,4 +97,14 @@ class Harpers_full(BasicNewsRecipe):
|
|||||||
,'url' :url
|
,'url' :url
|
||||||
,'description':''
|
,'description':''
|
||||||
})
|
})
|
||||||
return [(soup.head.title.string, articles)]
|
return [(soup1.head.title.string, articles)]
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
return url + '?single=1'
|
||||||
|
|
||||||
|
def cleanup(self):
|
||||||
|
soup = self.index_to_soup('http://harpers.org/')
|
||||||
|
signouturl=self.tag_to_string(soup.find('li', attrs={'class':'subLogOut'}).findNext('li').a['href'])
|
||||||
|
self.log(signouturl)
|
||||||
|
self.browser.open(signouturl)
|
||||||
|
|
||||||
|
@ -15,23 +15,12 @@ class AdvancedUserRecipe(BasicNewsRecipe):
     timeout = 5
     no_stylesheets = True

+    keep_only_tags = [dict(name='div', attrs={'id':'mitte_news'}),
+                      dict(name='h1', attrs={'class':'clear'}),
+                      dict(name='div', attrs={'class':'meldung_wrapper'})]

-    remove_tags_after = dict(name ='p', attrs={'class':'editor'})
     remove_tags = [dict(id='navi_top_container'),
-                   dict(id='navi_bottom'),
-                   dict(id='mitte_rechts'),
-                   dict(id='navigation'),
-                   dict(id='subnavi'),
-                   dict(id='social_bookmarks'),
-                   dict(id='permalink'),
-                   dict(id='content_foren'),
-                   dict(id='seiten_navi'),
-                   dict(id='adbottom'),
-                   dict(id='sitemap'),
-                   dict(name='div', attrs={'id':'sitemap'}),
-                   dict(name='ul', attrs={'class':'erste_zeile'}),
-                   dict(name='ul', attrs={'class':'zweite_zeile'}),
-                   dict(name='div', attrs={'class':'navi_top_container'})]
+                   dict(name='p', attrs={'class':'size80'})]

     feeds = [
         ('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
@ -54,5 +43,3 @@ class AdvancedUserRecipe(BasicNewsRecipe):

     def print_version(self, url):
         return url + '?view=print'
@ -16,10 +16,14 @@ class TheHindu(BasicNewsRecipe):

     keep_only_tags = [dict(id='content')]
     remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
-                   dict(id=['email-section', 'right-column', 'printfooter'])]
+                   dict(id=['email-section', 'right-column', 'printfooter', 'topover',
+                            'slidebox', 'th_footer'])]

     extra_css = '.photo-caption { font-size: smaller }'

+    def preprocess_raw_html(self, raw, url):
+        return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
+
     def postprocess_html(self, soup, first_fetch):
         for t in soup.findAll(['table', 'tr', 'td','center']):
             t.name = 'div'
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class Historia_org_pl(BasicNewsRecipe):
     title = u'Historia.org.pl'
     __author__ = 'fenuks'
-    description = u'history site'
+    description = u'Artykuły dotyczące historii w układzie epok i tematów, forum. Najlepsza strona historii. Matura z historii i egzamin gimnazjalny z historii.'
     cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg'
     category = 'history'
     language = 'pl'
@ -12,16 +12,15 @@ class Historia_org_pl(BasicNewsRecipe):
     no_stylesheets = True
     use_embedded_content = True
     max_articles_per_feed = 100
+    ignore_duplicate_articles = {'title', 'url'}

-    feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=atom'),
-             (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=atom'),
-             (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=atom'),
-             (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=atom'),
-             (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=atom'),
-             (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=atom'),
-             (u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=atom'),
-             (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=atom'),
-             (u'Konkursy'), (u'http://www.historia.org.pl/index.php/konkursy.feed?type=atom')]
+    feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'),
+             (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'),
+             (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'),
+             (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'),
+             (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'),
+             (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),]


     def print_version(self, url):
BIN    recipes/icons/astroflesz.png    Normal file    (1.1 KiB)
BIN    recipes/icons/czas_gentlemanow.png    Normal file    (24 KiB)
BIN    recipes/icons/ekologia_pl.png    Normal file    (702 B)
BIN    recipes/icons/libartes.png    Normal file    (282 B)
BIN    recipes/icons/poradnia_pwn.png    Normal file    (350 B)
BIN    recipes/icons/tvp_info.png    Normal file    (329 B)
BIN    recipes/icons/zaufana_trzecia_strona.png    Normal file    (412 B)
@ -28,12 +28,15 @@ class IlMessaggero(BasicNewsRecipe):
     recursion = 10

     remove_javascript = True
+    extra_css = ' .bianco31lucida{color: black} '

-    keep_only_tags = [dict(name='h1', attrs={'class':'titoloLettura2'}),
-                      dict(name='h2', attrs={'class':'sottotitLettura'}),
-                      dict(name='span', attrs={'class':'testoArticoloG'})
+    keep_only_tags = [dict(name='h1', attrs={'class':['titoloLettura2','titoloart','bianco31lucida']}),
+                      dict(name='h2', attrs={'class':['sottotitLettura','grigio16']}),
+                      dict(name='span', attrs={'class':'testoArticoloG'}),
+                      dict(name='div', attrs={'id':'testodim'})
                      ]

     def get_cover_url(self):
         cover = None
         st = time.localtime()
@ -55,17 +58,16 @@ class IlMessaggero(BasicNewsRecipe):
     feeds = [
            (u'HomePage', u'http://www.ilmessaggero.it/rss/home.xml'),
            (u'Primo Piano', u'http://www.ilmessaggero.it/rss/initalia_primopiano.xml'),
-           (u'Cronaca Bianca', u'http://www.ilmessaggero.it/rss/initalia_cronacabianca.xml'),
-           (u'Cronaca Nera', u'http://www.ilmessaggero.it/rss/initalia_cronacanera.xml'),
            (u'Economia e Finanza', u'http://www.ilmessaggero.it/rss/economia.xml'),
            (u'Politica', u'http://www.ilmessaggero.it/rss/initalia_politica.xml'),
-           (u'Scienza e Tecnologia', u'http://www.ilmessaggero.it/rss/scienza.xml'),
-           (u'Cinema', u'http://www.ilmessaggero.it/rss.php?refresh_ce#'),
-           (u'Viaggi', u'http://www.ilmessaggero.it/rss.php?refresh_ce#'),
+           (u'Cultura', u'http://www.ilmessaggero.it/rss/cultura.xml'),
+           (u'Tecnologia', u'http://www.ilmessaggero.it/rss/tecnologia.xml'),
+           (u'Spettacoli', u'http://www.ilmessaggero.it/rss/spettacoli.xml'),
+           (u'Edizioni Locali', u'http://www.ilmessaggero.it/rss/edlocali.xml'),
            (u'Roma', u'http://www.ilmessaggero.it/rss/roma.xml'),
-           (u'Cultura e Tendenze', u'http://www.ilmessaggero.it/rss/roma_culturaspet.xml'),
+           (u'Benessere', u'http://www.ilmessaggero.it/rss/benessere.xml'),
            (u'Sport', u'http://www.ilmessaggero.it/rss/sport.xml'),
-           (u'Calcio', u'http://www.ilmessaggero.it/rss/sport_calcio.xml'),
-           (u'Motori', u'http://www.ilmessaggero.it/rss/sport_motori.xml')
+           (u'Moda', u'http://www.ilmessaggero.it/rss/moda.xml')
            ]
@ -47,9 +47,10 @@ class TheIndependentNew(BasicNewsRecipe):
         dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
         dict(name='img',attrs={'alt' : ['view gallery']}),
         dict(attrs={'style' : re.compile('.*')}),
+        dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}),
         ]

-    keep_only_tags =[dict(attrs={'id':'main'})]
+    keep_only_tags =[dict(attrs={'id':['main','top']})]
     recursions = 0

     # fixes non compliant html nesting and 'marks' article graphics links
@ -69,7 +70,7 @@ class TheIndependentNew(BasicNewsRecipe):
         }

     extra_css = """
-        h1{font-family: Georgia,serif }
+        h1{font-family: Georgia,serif ; font-size: x-large; }
         body{font-family: Verdana,Arial,Helvetica,sans-serif}
         img{margin-bottom: 0.4em; display:block}
         .starRating img {float: left}
@ -77,16 +78,21 @@ class TheIndependentNew(BasicNewsRecipe):
         .image {clear:left; font-size: x-small; color:#888888;}
         .articleByTimeLocation {font-size: x-small; color:#888888;
             margin-bottom:0.2em ; margin-top:0.2em ; display:block}
-        .subtitle {clear:left}
+        .subtitle {clear:left ;}
         .column-1 h1 { color: #191919}
         .column-1 h2 { color: #333333}
         .column-1 h3 { color: #444444}
-        .column-1 p { color: #777777}
-        .column-1 p,a,h1,h2,h3 { margin: 0; }
-        .column-1 div{color:#888888; margin: 0;}
+        .subtitle { color: #777777; font-size: medium;}
+        .column-1 a,h1,h2,h3 { margin: 0; }
+        .column-1 div{margin: 0;}
         .articleContent {display: block; clear:left;}
+        .articleContent {color: #000000; font-size: medium;}
+        .ivDrip-section {color: #000000; font-size: medium;}
+        .datetime {color: #888888}
+        .title {font-weight:bold;}
         .storyTop{}
         .pictureContainer img { max-width: 400px; max-height: 400px;}
+        .image img { max-width: 400px; max-height: 400px;}
         """

     oldest_article = 1
@ -325,6 +331,20 @@ class TheIndependentNew(BasicNewsRecipe):
                 item.contents[0] = ''

     def postprocess_html(self,soup, first_fetch):
+
+        #mark subtitle parent as non-compliant nesting causes
+        # p's to be 'popped out' of the h3 tag they are nested in.
+        subtitle = soup.find('h3', attrs={'class' : 'subtitle'})
+        subtitle_div = None
+        if subtitle:
+            subtitle_div = subtitle.parent
+        if subtitle_div:
+            clazz = ''
+            if 'class' in subtitle_div:
+                clazz = subtitle_div['class'] + ' '
+            clazz = clazz + 'subtitle'
+            subtitle_div['class'] = clazz
+
         #find broken images and remove captions
         items_to_extract = []
         for item in soup.findAll('div', attrs={'class' : 'image'}):
@ -501,6 +521,9 @@ class TheIndependentNew(BasicNewsRecipe):
                     ),
                    (u'Opinion',
                     u'http://www.independent.co.uk/opinion/?service=rss'),
+                   (u'Voices',
+                    u'http://www.independent.co.uk/voices/?service=rss'
+                    ),
                    (u'Environment',
                     u'http://www.independent.co.uk/environment/?service=rss'),
                    (u'Sport - Athletics',
@ -9,6 +9,21 @@ class Kosmonauta(BasicNewsRecipe):
     language = 'pl'
     cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
     no_stylesheets = True
+    INDEX = 'http://www.kosmonauta.net'
     oldest_article = 7
+    no_stylesheets = True
     max_articles_per_feed = 100
-    feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')]
+    keep_only_tags = [dict(name='div', attrs={'class':'item-page'})]
+    remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})]
+    remove_tags_after = dict(name='div', attrs={'class':'cedtag'})
+    feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')]
+
+    def preprocess_html(self, soup):
+        for a in soup.findAll(name='a'):
+            if a.has_key('href'):
+                href = a['href']
+                if not href.startswith('http'):
+                    a['href'] = self.INDEX + href
+                    print '%%%%%%%%%%%%%%%%%%%%%%%%%', a['href']
+        return soup
@ -1,15 +1,16 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 class Ksiazka_net_pl(BasicNewsRecipe):
-    title = u'ksiazka.net.pl'
+    title = u'książka.net.pl'
     __author__ = 'fenuks'
-    description = u'Ksiazka.net.pl - book vortal'
+    description = u'Portal Księgarski - tematyczny serwis o książkach. Wydarzenia z rynku księgarsko-wydawniczego, nowości, zapowiedzi, bestsellery, setki recenzji. Niezbędne informacje dla każdego miłośnika książek, księgarza, bibliotekarza i wydawcy.'
     cover_url = 'http://www.ksiazka.net.pl/fileadmin/templates/ksiazka.net.pl/images/1PortalKsiegarski-logo.jpg'
     category = 'books'
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
     no_stylesheets= True
+    remove_empty_feeds = True
     #extra_css = 'img {float: right;}'
     preprocess_regexps = [(re.compile(ur'Podoba mi się, kupuję:'), lambda match: '<br />')]
     remove_tags_before= dict(name='div', attrs={'class':'m-body'})
@ -2,7 +2,7 @@
 __license__ = 'GPL v3'
 __author__ = 'Gabriele Marini, based on Darko Miletic'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
-__description__ = 'La Stampa 05/05/2010'
+__description__ = 'La Stampa 28/12/2012'

 '''
 http://www.lastampa.it/
@ -14,10 +14,11 @@ class LaStampa(BasicNewsRecipe):
     title = u'La Stampa'
     language = 'it'
     __author__ = 'Gabriele Marini'
-    oldest_article = 15
+    #oldest_article = 15
+    oldest_articlce = 7 #for daily schedule
     max_articles_per_feed = 50
     recursion = 100
-    cover_url = 'http://www.lastampa.it/edicola/PDF/1.pdf'
+    cover_url = 'http://www1.lastampa.it/edicola/PDF/1.pdf'
     use_embedded_content = False
     remove_javascript = True
     no_stylesheets = True
@ -33,35 +34,41 @@ class LaStampa(BasicNewsRecipe):
             if link:
                 return link[0]['href']

-    keep_only_tags = [dict(attrs={'class':['boxocchiello2','titoloRub','titologir','catenaccio','sezione','articologirata']}),
+    keep_only_tags = [dict(attrs={'class':['boxocchiello2','titoloRub','titologir','autore-girata','luogo-girata','catenaccio','sezione','articologirata','bodytext','news-single-img','ls-articoloCorpo','ls-blog-list-1col']}),
                       dict(name='div', attrs={'id':'corpoarticolo'})
                      ]

-    remove_tags = [dict(name='div', attrs={'id':'menutop'}),
-                   dict(name='div', attrs={'id':'fwnetblocco'}),
-                   dict(name='table', attrs={'id':'strumenti'}),
-                   dict(name='table', attrs={'id':'imgesterna'}),
-                   dict(name='a', attrs={'class':'linkblu'}),
-                   dict(name='a', attrs={'class':'link'}),
+    remove_tags = [dict(name='div', attrs={'id':['menutop','fwnetblocco']}),
+                   dict(attrs={'class':['ls-toolbarCommenti','ls-boxCommentsBlog']}),
+                   dict(name='table', attrs={'id':['strumenti','imgesterna']}),
+                   dict(name='a', attrs={'class':['linkblu','link']}),
                    dict(name='span', attrs={'class':['boxocchiello','boxocchiello2','sezione']})
                   ]

-    feeds = [
-             (u'Home', u'http://www.lastampa.it/redazione/rss_home.xml'),
-             (u'Editoriali', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=25'),
-             (u'Politica', u'http://www.lastampa.it/redazione/cmssezioni/politica/rss_politica.xml'),
-             (u'ArciItaliana', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=14'),
-             (u'Cronache', u'http://www.lastampa.it/redazione/cmssezioni/cronache/rss_cronache.xml'),
-             (u'Esteri', u'http://www.lastampa.it/redazione/cmssezioni/esteri/rss_esteri.xml'),
-             (u'Danni Collaterali', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=90'),
-             (u'Economia', u'http://www.lastampa.it/redazione/cmssezioni/economia/rss_economia.xml'),
-             (u'Tecnologia ', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=30'),
-             (u'Spettacoli', u'http://www.lastampa.it/redazione/cmssezioni/spettacoli/rss_spettacoli.xml'),
-             (u'Sport', u'http://www.lastampa.it/sport/rss_home.xml'),
-             (u'Torino', u'http://rss.feedsportal.com/c/32418/f/466938/index.rss'),
-             (u'Motori', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=57'),
-             (u'Scienza', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=38'),
-             (u'Fotografia', u'http://rss.feedsportal.com/c/32418/f/478449/index.rss'),
-             (u'Scuola', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=60'),
-             (u'Tempo Libero', u'http://www.lastampa.it/tempolibero/rss_home.xml')
+    feeds = [(u'BuonGiorno',u'http://www.lastampa.it/cultura/opinioni/buongiorno/rss.xml'),
+             (u'Jena', u'http://www.lastampa.it/cultura/opinioni/jena/rss.xml'),
+             (u'Editoriali', u'http://www.lastampa.it/cultura/opinioni/editoriali'),
+             (u'Finestra sull America', u'http://lastampa.feedsportal.com/c/32418/f/625713/index.rss'),
+             (u'HomePage', u'http://www.lastampa.it/rss.xml'),
+             (u'Politica Italia', u'http://www.lastampa.it/italia/politica/rss.xml'),
+             (u'ArciItaliana', u'http://www.lastampa.it/rss/blog/arcitaliana'),
+             (u'Cronache', u'http://www.lastampa.it/italia/cronache/rss.xml'),
+             (u'Esteri', u'http://www.lastampa.it/esteri/rss.xml'),
+             (u'Danni Collaterali', u'http://www.lastampa.it/rss/blog/danni-collaterali'),
+             (u'Economia', u'http://www.lastampa.it/economia/rss.xml'),
+             (u'Tecnologia ', u'http://www.lastampa.it/tecnologia/rss.xml'),
+             (u'Spettacoli', u'http://www.lastampa.it/spettacoli/rss.xml'),
+             (u'Sport', u'http://www.lastampa.it/sport/rss.xml'),
+             (u'Torino', u'http://www.lastampa.it/cronaca/rss.xml'),
+             (u'Motori', u'http://www.lastampa.it/motori/rss.xml'),
+             (u'Scienza', u'http://www.lastampa.it/scienza/rss.xml'),
+             (u'Cultura', u'http://www.lastampa.it/cultura/rss.xml'),
+             (u'Scuola', u'http://www.lastampa.it/cultura/scuola/rss.xml'),
+             (u'Benessere', u'http://www.lastampa.it/scienza/benessere/rss.xml'),
+             (u'Cucina', u'http://www.lastampa.it/societa/cucina/rss.xml'),
+             (u'Casa', u'http://www.lastampa.it/societa/casa/rss.xml'),
+             (u'Moda',u'http://www.lastampa.it/societa/moda/rss.xml'),
+             (u'Giochi',u'http://www.lastampa.it/tecnologia/giochi/rss.xml'),
+             (u'Viaggi',u'http://www.lastampa.it/societa/viaggi/rss.xml'),
+             (u'Ambiente', u'http://www.lastampa.it/scienza/ambiente/rss.xml')
             ]
@ -7,9 +7,9 @@ class AdvancedUserRecipe1324114228(BasicNewsRecipe):
     max_articles_per_feed = 100
     auto_cleanup = True
     masthead_url = 'http://www.lavoce.info/binary/la_voce/testata/lavoce.1184661635.gif'
-    feeds = [(u'La Voce', u'http://www.lavoce.info/feed_rss.php?id_feed=1')]
+    feeds = [(u'La Voce', u'http://www.lavoce.info/feed/')]
     __author__ = 'faber1971'
-    description = 'Italian website on Economy - v1.01 (17, December 2011)'
+    description = 'Italian website on Economy - v1.02 (27, December 2012)'
     language = 'it'
@ -22,13 +22,15 @@ class LeMonde(BasicNewsRecipe):
     #publication_type = 'newsportal'
     extra_css = '''
                 h1{font-size:130%;}
+                h2{font-size:100%;}
+                blockquote.aside {background-color: #DDD; padding: 0.5em;}
                 .ariane{font-size:xx-small;}
                 .source{font-size:xx-small;}
-                #.href{font-size:xx-small;}
+                /*.href{font-size:xx-small;}*/
-                #.figcaption style{color:#666666; font-size:x-small;}
+                /*.figcaption style{color:#666666; font-size:x-small;}*/
-                #.main-article-info{font-family:Arial,Helvetica,sans-serif;}
+                /*.main-article-info{font-family:Arial,Helvetica,sans-serif;}*/
-                #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
+                /*full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}*/
-                #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
+                /*match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}*/
                 '''
     #preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
     conversion_options = {
@ -44,6 +46,9 @@ class LeMonde(BasicNewsRecipe):
     filterDuplicates = True

     def preprocess_html(self, soup):
+        for aside in soup.findAll('aside'):
+            aside.name='blockquote'
+            aside['class'] = "aside"
         for alink in soup.findAll('a'):
             if alink.string is not None:
                 tstr = alink.string
@ -107,7 +112,9 @@ class LeMonde(BasicNewsRecipe):
     ]

     remove_tags = [
-        dict(name='div', attrs={'class':['bloc_base meme_sujet']}),
+        dict(attrs={'class':['rubriques_liees']}),
+        dict(attrs={'class':['sociaux']}),
+        dict(attrs={'class':['bloc_base meme_sujet']}),
         dict(name='p', attrs={'class':['lire']})
     ]

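A condensed sketch of the tag-renaming pattern this hunk introduces; it assumes only the BeautifulSoup object that calibre passes to preprocess_html and omits the link rewriting the recipe also performs.

    def preprocess_html(self, soup):
        # Rename each <aside> to <blockquote class="aside"> so the rule added
        # to extra_css above can style it; BeautifulSoup tags expose .name and
        # dict-style attribute access, so the rewrite is a pair of assignments.
        for aside in soup.findAll('aside'):
            aside.name = 'blockquote'
            aside['class'] = 'aside'
        return soup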
@ -32,26 +32,28 @@ class ledevoir(BasicNewsRecipe):
     recursion = 10
     needs_subscription = 'optional'

-    filterDuplicates = False
     url_list = []

     remove_javascript = True
     no_stylesheets = True
+    auto_cleanup = True

     preprocess_regexps = [(re.compile(r'(title|alt)=".*?>.*?"', re.DOTALL), lambda m: '')]

-    keep_only_tags = [
-        dict(name='div', attrs={'id':'article'}),
-        dict(name='div', attrs={'id':'colonne_principale'})
-    ]
+    #keep_only_tags = [
+        #dict(name='div', attrs={'id':'article_detail'}),
+        #dict(name='div', attrs={'id':'colonne_principale'})
+    #]

-    remove_tags = [
-        dict(name='div', attrs={'id':'dialog'}),
-        dict(name='div', attrs={'class':['interesse_actions','reactions']}),
-        dict(name='ul', attrs={'class':'mots_cles'}),
-        dict(name='a', attrs={'class':'haut'}),
-        dict(name='h5', attrs={'class':'interesse_actions'})
-    ]
+    #remove_tags = [
+        #dict(name='div', attrs={'id':'dialog'}),
+        #dict(name='div', attrs={'class':['interesse_actions','reactions','taille_du_texte right clearfix','partage_sociaux clearfix']}),
+        #dict(name='aside', attrs={'class':['article_actions clearfix','reactions','partage_sociaux_wrapper']}),
+        #dict(name='ul', attrs={'class':'mots_cles'}),
+        #dict(name='ul', attrs={'id':'commentaires'}),
+        #dict(name='a', attrs={'class':'haut'}),
+        #dict(name='h5', attrs={'class':'interesse_actions'})
+    #]

     feeds = [
         (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
@ -95,10 +97,4 @@ class ledevoir(BasicNewsRecipe):
         br.submit()
         return br

-    def print_version(self, url):
-        if self.filterDuplicates:
-            if url in self.url_list:
-                return
-        self.url_list.append(url)
-        return url

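A rough sketch of what the switch to auto_cleanup in this hunk amounts to for a recipe; every name and URL below is illustrative, not part of the Le Devoir recipe.

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleAutoCleanupRecipe(BasicNewsRecipe):
    # With auto_cleanup enabled, calibre runs its readability-style content
    # extractor on each downloaded page, so hand-maintained keep_only_tags /
    # remove_tags lists (commented out in the hunk above) are not needed.
    title = 'Example'
    auto_cleanup = True
    no_stylesheets = True
    remove_javascript = True
    feeds = [(u'A la une', u'http://example.com/rss/manchettes.xml')]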
recipes/libartes.recipe (new file, 69 lines)
@ -0,0 +1,69 @@
+__license__ = 'GPL v3'
+__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
+'''
+libartes.com
+'''
+
+import re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Libartes(BasicNewsRecipe):
+    title = 'Libartes'
+    __author__ = 'Darko Miletic'
+    description = 'Elektronski časopis Libartes delo je kulturnih entuzijasta, umetnika i teoretičara umetnosti i književnosti. Časopis Libartes izlazi tromesečno i bavi se različitim granama umetnosti - književnošću, muzikom, filmom, likovnim umetnostima, dizajnom i arhitekturom.'
+    publisher = 'Libartes'
+    category = 'literatura, knjizevnost, film, dizajn, arhitektura, muzika'
+    no_stylesheets = True
+    INDEX = 'http://libartes.com/'
+    use_embedded_content = False
+    encoding = 'utf-8'
+    language = 'sr'
+    publication_type = 'magazine'
+    masthead_url = 'http://libartes.com/index_files/logo.gif'
+    extra_css = """
+        @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: "Times New Roman",Times,serif1, serif}
+        img{display:block}
+        .naslov{font-size: xx-large; font-weight: bold}
+        .nag{font-size: large; font-weight: bold}
+    """
+
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+    }
+
+    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
+    remove_tags_before = dict(attrs={'id':'nav'})
+    remove_tags_after = dict(attrs={'id':'fb' })
+    keep_only_tags = [dict(name='div', attrs={'id':'center_content'})]
+    remove_tags = [
+        dict(name=['object','link','iframe','embed','meta'])
+        ,dict(attrs={'id':'nav'})
+    ]
+
+    def parse_index(self):
+        articles = []
+        soup = self.index_to_soup(self.INDEX)
+        for item in soup.findAll(name='a', attrs={'class':'belad'}, href=True):
+            feed_link = item
+            if feed_link['href'].startswith(self.INDEX):
+                url = feed_link['href']
+            else:
+                url = self.INDEX + feed_link['href']
+
+            title = self.tag_to_string(feed_link)
+            date = strftime(self.timefmt)
+            articles.append({
+                'title' :title
+                ,'date' :date
+                ,'url' :url
+                ,'description':''
+            })
+        return [('Casopis Libartes', articles)]
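As a reference for the parse_index method used by this new recipe, a small hedged sketch of the return-value contract; the recipe name, index URL and section title below are made up.

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleIndexRecipe(BasicNewsRecipe):
    # parse_index must return a list of (section_title, article_list) tuples;
    # each article is a dict with at least 'title' and 'url', plus optional
    # 'date' and 'description' keys.
    title = 'Example Index'
    INDEX = 'http://example.com/'   # placeholder

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        articles = []
        for link in soup.findAll('a', href=True):
            articles.append({
                'title': self.tag_to_string(link),
                'url': link['href'],
                'date': '',
                'description': '',
            })
        return [('All articles', articles)]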
@ -14,7 +14,8 @@ class LiberoNews(BasicNewsRecipe):
     __author__ = 'Marini Gabriele'
     description = 'Italian daily newspaper'

-    cover_url = 'http://www.libero-news.it/images/logo.png'
+    #cover_url = 'http://www.liberoquotidiano.it/images/Libero%20Quotidiano.jpg'
+    cover_url = 'http://www.edicola.liberoquotidiano.it/vnlibero/fpcut.jsp?testata=milano'
     title = u'Libero '
     publisher = 'EDITORIALE LIBERO s.r.l 2006'
     category = 'News, politics, culture, economy, general interest'
@ -32,10 +33,11 @@ class LiberoNews(BasicNewsRecipe):
     remove_javascript = True

     keep_only_tags = [
-        dict(name='div', attrs={'class':'Articolo'})
+        dict(name='div', attrs={'class':'Articolo'}),
+        dict(name='article')
     ]
     remove_tags = [
-        dict(name='div', attrs={'class':['CommentaFoto','Priva2']}),
+        dict(name='div', attrs={'class':['CommentaFoto','Priva2','login_commenti','box_16']}),
         dict(name='div', attrs={'id':['commentigenerale']})
     ]
     feeds = [
recipes/lvivs_ks_ghazieta.recipe (new file, 12 lines)
@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1356270446(BasicNewsRecipe):
+    title = u'\u041b\u044c\u0432\u0456\u0432\u0441\u044c\u043a\u0430 \u0433\u0430\u0437\u0435\u0442\u0430'
+    __author__ = 'rpalyvoda'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    language = 'uk'
+    cover_url = 'http://lvivska.com/sites/all/themes/biblos/images/logo.png'
+    masthead_url = 'http://lvivska.com/sites/all/themes/biblos/images/logo.png'
+    auto_cleanup = True
+    feeds = [(u'\u041d\u043e\u0432\u0438\u043d\u0438', u'http://lvivska.com/rss/news.xml'), (u'\u041f\u043e\u043b\u0456\u0442\u0438\u043a\u0430', u'http://lvivska.com/rss/politic.xml'), (u'\u0415\u043a\u043e\u043d\u043e\u043c\u0456\u043a\u0430', u'http://lvivska.com/rss/economic.xml'), (u'\u041f\u0440\u0430\u0432\u043e', u'http://lvivska.com/rss/law.xml'), (u'\u0421\u0432\u0456\u0442', u'http://lvivska.com/rss/world.xml'), (u'\u0416\u0438\u0442\u0442\u044f', u'http://lvivska.com/rss/life.xml'), (u'\u041a\u0443\u043b\u044c\u0442\u0443\u0440\u0430', u'http://lvivska.com/rss/culture.xml'), (u'\u041b\u0430\u0441\u0443\u043d', u'http://lvivska.com/rss/cooking.xml'), (u'\u0421\u0442\u0438\u043b\u044c', u'http://lvivska.com/rss/style.xml'), (u'Galicia Incognita', u'http://lvivska.com/rss/galiciaincognita.xml'), (u'\u0421\u043f\u043e\u0440\u0442', u'http://lvivska.com/rss/sport.xml'), (u'\u0415\u043a\u043e\u043b\u043e\u0433\u0456\u044f', u'http://lvivska.com/rss/ecology.xml'), (u"\u0417\u0434\u043e\u0440\u043e\u0432'\u044f", u'http://lvivska.com/rss/health.xml'), (u'\u0410\u0432\u0442\u043e', u'http://lvivska.com/rss/auto.xml'), (u'\u0411\u043b\u043e\u0433\u0438', u'http://lvivska.com/rss/blog.xml')]
@ -1,43 +1,74 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre import strftime
|
||||||
|
import re
|
||||||
|
import datetime
|
||||||
|
import time
|
||||||
|
|
||||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||||
title = u'Metro UK'
|
title = u'Metro UK'
|
||||||
description = 'Author Dave Asbury : News from The Metro - UK'
|
description = 'News as provided by The Metro -UK'
|
||||||
#timefmt = ''
|
#timefmt = ''
|
||||||
__author__ = 'Dave Asbury'
|
__author__ = 'Dave Asbury'
|
||||||
#last update 9/9/12
|
#last update 9/6/12
|
||||||
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
|
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
|
||||||
no_stylesheets = True
|
|
||||||
oldest_article = 1
|
oldest_article = 1
|
||||||
max_articles_per_feed = 12
|
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
#auto_cleanup = True
|
auto_cleanup = True
|
||||||
encoding = 'UTF-8'
|
encoding = 'UTF-8'
|
||||||
cover_url ='http://profile.ak.fbcdn.net/hprofile-ak-snc4/157897_117118184990145_840702264_n.jpg'
|
|
||||||
language = 'en_GB'
|
language = 'en_GB'
|
||||||
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
|
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
|
||||||
extra_css = '''
|
|
||||||
h1{font-family:Arial,Helvetica,sans-serif; font-weight:900;font-size:1.6em;}
|
|
||||||
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:1.2em;}
|
|
||||||
p{font-family:Arial,Helvetica,sans-serif;font-size:1.0em;}
|
|
||||||
body{font-family:Helvetica,Arial,sans-serif;font-size:1.0em;}
|
|
||||||
'''
|
|
||||||
keep_only_tags = [
|
|
||||||
#dict(name='h1'),
|
|
||||||
#dict(name='h2'),
|
|
||||||
#dict(name='div', attrs={'class' : ['row','article','img-cnt figure','clrd']})
|
|
||||||
#dict(name='h3'),
|
|
||||||
#dict(attrs={'class' : 'BText'}),
|
|
||||||
]
|
|
||||||
remove_tags = [
|
|
||||||
dict(name='div',attrs={'class' : 'art-fd fd-gr1-b clrd'}),
|
|
||||||
dict(name='span',attrs={'class' : 'share'}),
|
|
||||||
dict(name='li'),
|
|
||||||
dict(attrs={'class' : ['twitter-share-button','header-forms','hdr-lnks','close','art-rgt','fd-gr1-b clrd google-article','news m12 clrd clr-b p5t shareBtm','item-ds csl-3-img news','c-1of3 c-last','c-1of1','pd','item-ds csl-3-img sport']}),
|
|
||||||
dict(attrs={'id' : ['','sky-left','sky-right','ftr-nav','and-ftr','notificationList','logo','miniLogo','comments-news','metro_extras']})
|
|
||||||
]
|
|
||||||
remove_tags_before = dict(name='h1')
|
|
||||||
#remove_tags_after = dict(attrs={'id':['topic-buttons']})
|
|
||||||
|
|
||||||
feeds = [
|
def parse_index(self):
|
||||||
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
|
articles = {}
|
||||||
|
key = None
|
||||||
|
ans = []
|
||||||
|
feeds = [ ('UK', 'http://metro.co.uk/news/uk/'),
|
||||||
|
('World', 'http://metro.co.uk/news/world/'),
|
||||||
|
('Weird', 'http://metro.co.uk/news/weird/'),
|
||||||
|
('Money', 'http://metro.co.uk/news/money/'),
|
||||||
|
('Sport', 'http://metro.co.uk/sport/'),
|
||||||
|
('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/')
|
||||||
|
]
|
||||||
|
for key, feed in feeds:
|
||||||
|
soup = self.index_to_soup(feed)
|
||||||
|
articles[key] = []
|
||||||
|
ans.append(key)
|
||||||
|
|
||||||
|
today = datetime.date.today()
|
||||||
|
today = time.mktime(today.timetuple())-60*60*24
|
||||||
|
|
||||||
|
for a in soup.findAll('a'):
|
||||||
|
for name, value in a.attrs:
|
||||||
|
if name == "class" and value=="post":
|
||||||
|
url = a['href']
|
||||||
|
title = a['title']
|
||||||
|
print title
|
||||||
|
description = ''
|
||||||
|
m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
|
||||||
|
skip = 1
|
||||||
|
if len(m.groups()) == 3:
|
||||||
|
g = m.groups()
|
||||||
|
dt = datetime.datetime.strptime(''+g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d')
|
||||||
|
pubdate = time.strftime('%a, %d %b', dt.timetuple())
|
||||||
|
|
||||||
|
dt = time.mktime(dt.timetuple())
|
||||||
|
if dt >= today:
|
||||||
|
print pubdate
|
||||||
|
skip = 0
|
||||||
|
else:
|
||||||
|
pubdate = strftime('%a, %d %b')
|
||||||
|
|
||||||
|
summary = a.find(True, attrs={'class':'excerpt'})
|
||||||
|
if summary:
|
||||||
|
description = self.tag_to_string(summary, use_alt=False)
|
||||||
|
|
||||||
|
if skip == 0:
|
||||||
|
articles[key].append(
|
||||||
|
dict(title=title, url=url, date=pubdate,
|
||||||
|
description=description,
|
||||||
|
content=''))
|
||||||
|
#ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
|
||||||
|
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||||
|
return ans
|
||||||
|
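The Metro UK hunk above replaces a static feeds list with a parse_index that scrapes section pages and keeps only recent articles by decoding the date embedded in each article URL. A self-contained sketch of that age check, with an illustrative helper name and a simplified regex:

import re
import datetime

def article_is_recent(url, max_age_days=1):
    # Metro article URLs embed the publication date as .../YYYY/MM/DD/slug/;
    # pull it out and compare against a cutoff, mirroring the logic added in
    # the recipe above (helper name and regex are illustrative).
    m = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
    if m is None:
        return True  # keep articles whose URL carries no date
    published = datetime.datetime(*map(int, m.groups()))
    cutoff = datetime.datetime.now() - datetime.timedelta(days=max_age_days)
    return published >= cutoff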
@ -1,224 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
##
|
|
||||||
## Title: Microwave and RF
|
|
||||||
##
|
|
||||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
|
||||||
|
|
||||||
# Feb 2012: Initial release
|
|
||||||
|
|
||||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
|
||||||
'''
|
|
||||||
mwrf.com
|
|
||||||
'''
|
|
||||||
|
|
||||||
import re
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
|
||||||
from calibre.utils.magick import Image
|
|
||||||
|
|
||||||
class Microwaves_and_RF(BasicNewsRecipe):
|
|
||||||
|
|
||||||
Convert_Grayscale = False # Convert images to gray scale or not
|
|
||||||
|
|
||||||
# Add sections that want to be excluded from the magazine
|
|
||||||
exclude_sections = []
|
|
||||||
|
|
||||||
# Add sections that want to be included from the magazine
|
|
||||||
include_sections = []
|
|
||||||
|
|
||||||
title = u'Microwaves and RF'
|
|
||||||
__author__ = u'kiavash'
|
|
||||||
description = u'Microwaves and RF Montly Magazine'
|
|
||||||
publisher = 'Penton Media, Inc.'
|
|
||||||
publication_type = 'magazine'
|
|
||||||
site = 'http://mwrf.com'
|
|
||||||
|
|
||||||
language = 'en'
|
|
||||||
asciiize = True
|
|
||||||
timeout = 120
|
|
||||||
simultaneous_downloads = 1 # very peaky site!
|
|
||||||
|
|
||||||
# Main article is inside this tag
|
|
||||||
keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})]
|
|
||||||
|
|
||||||
no_stylesheets = True
|
|
||||||
remove_javascript = True
|
|
||||||
|
|
||||||
# Flattens all the tables to make it compatible with Nook
|
|
||||||
conversion_options = {'linearize_tables' : True}
|
|
||||||
|
|
||||||
remove_tags = [
|
|
||||||
dict(name='span', attrs={'class':'body12'}),
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
|
||||||
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
|
||||||
|
|
||||||
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
|
||||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
|
||||||
.introduction, .first { font-weight: bold; } \
|
|
||||||
.cross-head { font-weight: bold; font-size: 125%; } \
|
|
||||||
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
|
||||||
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
|
|
||||||
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
|
||||||
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
|
||||||
font-size: 80%; font-style: italic; margin: 1px auto; } \
|
|
||||||
.story-date, .published { font-size: 80%; } \
|
|
||||||
table { width: 100%; } \
|
|
||||||
td img { display: block; margin: 5px auto; } \
|
|
||||||
ul { padding-top: 10px; } \
|
|
||||||
ol { padding-top: 10px; } \
|
|
||||||
li { padding-top: 5px; padding-bottom: 5px; } \
|
|
||||||
h1 { font-size: 175%; font-weight: bold; } \
|
|
||||||
h2 { font-size: 150%; font-weight: bold; } \
|
|
||||||
h3 { font-size: 125%; font-weight: bold; } \
|
|
||||||
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
|
||||||
|
|
||||||
# Remove the line breaks and float left/right and picture width/height.
|
|
||||||
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
|
||||||
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
|
||||||
(re.compile(r'float:.*?'), lambda m: ''),
|
|
||||||
(re.compile(r'width:.*?px'), lambda m: ''),
|
|
||||||
(re.compile(r'height:.*?px'), lambda m: '')
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def print_version(self, url):
|
|
||||||
url = re.sub(r'.html', '', url)
|
|
||||||
url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
|
|
||||||
return url
|
|
||||||
|
|
||||||
# Need to change the user agent to avoid potential download errors
|
|
||||||
def get_browser(self, *args, **kwargs):
|
|
||||||
from calibre import browser
|
|
||||||
kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
|
|
||||||
return browser(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_index(self):
|
|
||||||
|
|
||||||
# Fetches the main page of Microwaves and RF
|
|
||||||
soup = self.index_to_soup(self.site)
|
|
||||||
|
|
||||||
# First page has the ad, Let's find the redirect address.
|
|
||||||
url = soup.find('span', attrs={'class':'commonCopy'}).find('a').get('href')
|
|
||||||
if url.startswith('/'):
|
|
||||||
url = self.site + url
|
|
||||||
|
|
||||||
soup = self.index_to_soup(url)
|
|
||||||
|
|
||||||
# Searches the site for Issue ID link then returns the href address
|
|
||||||
# pointing to the latest issue
|
|
||||||
latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href')
|
|
||||||
|
|
||||||
# Fetches the index page for of the latest issue
|
|
||||||
soup = self.index_to_soup(latest_issue)
|
|
||||||
|
|
||||||
# Finds the main section of the page containing cover, issue date and
|
|
||||||
# TOC
|
|
||||||
ts = soup.find('div', attrs={'id':'columnContainer'})
|
|
||||||
|
|
||||||
# Finds the issue date
|
|
||||||
ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
|
|
||||||
self.log('Found Current Issue:', ds)
|
|
||||||
self.timefmt = ' [%s]'%ds
|
|
||||||
|
|
||||||
# Finds the cover image
|
|
||||||
cover = ts.find('img', src = lambda x: x and 'Cover' in x)
|
|
||||||
if cover is not None:
|
|
||||||
self.cover_url = self.site + cover['src']
|
|
||||||
self.log('Found Cover image:', self.cover_url)
|
|
||||||
|
|
||||||
feeds = []
|
|
||||||
article_info = []
|
|
||||||
|
|
||||||
# Finds all the articles (tiles and links)
|
|
||||||
articles = ts.findAll('a', attrs={'class':'commonArticleTitle'})
|
|
||||||
|
|
||||||
# Finds all the descriptions
|
|
||||||
descriptions = ts.findAll('span', attrs={'class':'commonCopy'})
|
|
||||||
|
|
||||||
# Find all the sections
|
|
||||||
sections = ts.findAll('span', attrs={'class':'kicker'})
|
|
||||||
|
|
||||||
title_number = 0
|
|
||||||
|
|
||||||
# Goes thru all the articles one by one and sort them out
|
|
||||||
for section in sections:
|
|
||||||
title_number = title_number + 1
|
|
||||||
|
|
||||||
# Removes the unwanted sections
|
|
||||||
if self.tag_to_string(section) in self.exclude_sections:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Only includes the wanted sections
|
|
||||||
if self.include_sections:
|
|
||||||
if self.tag_to_string(section) not in self.include_sections:
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
|
||||||
title = self.tag_to_string(articles[title_number])
|
|
||||||
url = articles[title_number].get('href')
|
|
||||||
if url.startswith('/'):
|
|
||||||
url = self.site + url
|
|
||||||
|
|
||||||
self.log('\tFound article:', title, 'at', url)
|
|
||||||
desc = self.tag_to_string(descriptions[title_number])
|
|
||||||
self.log('\t\t', desc)
|
|
||||||
|
|
||||||
article_info.append({'title':title, 'url':url, 'description':desc,
|
|
||||||
'date':self.timefmt})
|
|
||||||
|
|
||||||
if article_info:
|
|
||||||
feeds.append((self.title, article_info))
|
|
||||||
|
|
||||||
#self.log(feeds)
|
|
||||||
return feeds
|
|
||||||
|
|
||||||
def postprocess_html(self, soup, first):
|
|
||||||
if self.Convert_Grayscale:
|
|
||||||
#process all the images
|
|
||||||
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
|
|
||||||
iurl = tag['src']
|
|
||||||
img = Image()
|
|
||||||
img.open(iurl)
|
|
||||||
if img < 0:
|
|
||||||
raise RuntimeError('Out of memory')
|
|
||||||
img.type = "GrayscaleType"
|
|
||||||
img.save(iurl)
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
|
|
||||||
# Includes all the figures inside the final ebook
|
|
||||||
# Finds all the jpg links
|
|
||||||
for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}):
|
|
||||||
|
|
||||||
# makes sure that the link points to the absolute web address
|
|
||||||
if figure['href'].startswith('/'):
|
|
||||||
figure['href'] = self.site + figure['href']
|
|
||||||
|
|
||||||
figure.name = 'img' # converts the links to img
|
|
||||||
figure['src'] = figure['href'] # with the same address as href
|
|
||||||
figure['style'] = 'display:block' # adds /n before and after the image
|
|
||||||
del figure['href']
|
|
||||||
del figure['target']
|
|
||||||
|
|
||||||
# Makes the title standing out
|
|
||||||
for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}):
|
|
||||||
title.name = 'h1'
|
|
||||||
del title['href']
|
|
||||||
del title['target']
|
|
||||||
|
|
||||||
# Makes the section name more visible
|
|
||||||
for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}):
|
|
||||||
section_name.name = 'h5'
|
|
||||||
del section_name['href']
|
|
||||||
del section_name['target']
|
|
||||||
|
|
||||||
# Removes all unrelated links
|
|
||||||
for link in soup.findAll('a', attrs = {'href': True}):
|
|
||||||
link.name = 'font'
|
|
||||||
del link['href']
|
|
||||||
del link['target']
|
|
||||||
|
|
||||||
return soup
|
|
@ -2,7 +2,7 @@

 from calibre.web.feeds.news import BasicNewsRecipe
 class Mlody_technik(BasicNewsRecipe):
-    title = u'Mlody technik'
+    title = u'Młody technik'
     __author__ = 'fenuks'
     description = u'Młody technik'
     category = 'science'
recipes/mobile_bulgaria.recipe (new file, 27 lines)
@ -0,0 +1,27 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1329123365(BasicNewsRecipe):
+    title = u'Mobilebulgaria.com'
+    __author__ = 'M3 Web'
+    description = 'The biggest Bulgarian site covering mobile consumer electronics. Offers detailed reviews, popular discussion forum, shop and platform for selling new and second hand phones and gadgets.'
+    category = 'News, Reviews, Offers, Forum'
+    oldest_article = 45
+    max_articles_per_feed = 10
+    language = 'bg'
+    encoding = 'windows-1251'
+    no_stylesheets = False
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'bigblock'}),
+                      dict(name='div', attrs={'class':'verybigblock'}),
+                      dict(name='table', attrs={'class':'obiaviresults'}),
+                      dict(name='div', attrs={'class':'forumblock'}),
+                      dict(name='div', attrs={'class':'forumblock_b1'}),
+                      dict(name='div', attrs={'class':'block2_2colswrap'})]
+
+    feeds = [(u'News', u'http://www.mobilebulgaria.com/rss_full.php'),
+             (u'Reviews', u'http://www.mobilebulgaria.com/rss_reviews.php'),
+             (u'Offers', u'http://www.mobilebulgaria.com/obiavi/rss.php'),
+             (u'Forum', u'http://www.mobilebulgaria.com/rss_forum_last10.php')]
+
+    extra_css = '''
+        #gallery1 div{display: block; float: left; margin: 0 10px 10px 0;} '''
@ -66,8 +66,9 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
         self.log('Issue date:', date)

         # Find TOC
-        toc = soup.find('ul', attrs={'class':'issue-article-list'})
+        tocs = soup.findAll('ul', attrs={'class':'issue-article-list'})
         articles = []
+        for toc in tocs:
         for li in toc.findAll('li'):
             h3 = li.find('h3')
             title = self.tag_to_string(h3)
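A small sketch of the pattern behind this change: the issue TOC is split across several ul.issue-article-list blocks, so every matching list has to be walked rather than only the first one that find() would return. The helper name is illustrative.

def collect_titles(recipe, soup):
    # findAll returns every matching <ul>, where find() silently stops at the
    # first; recipe is a BasicNewsRecipe instance, soup a BeautifulSoup tree.
    titles = []
    for toc in soup.findAll('ul', attrs={'class': 'issue-article-list'}):
        for li in toc.findAll('li'):
            h3 = li.find('h3')
            if h3 is not None:
                titles.append(recipe.tag_to_string(h3))
    return titles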
@ -13,8 +13,11 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe):
     max_articles_per_feed = 30
     language = 'ja'
     no_stylesheets = True
-    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
-    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
+    #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
+    cover_margins = (10, 188, '#ffffff')

     remove_tags_before = {'class':"cmn-indent"}
     remove_tags = [
@ -40,8 +43,11 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe):
         print "-------------------------open top page-------------------------------------"
         br.open('http://www.nikkei.com/')
         print "-------------------------open first login form-----------------------------"
-        link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next()
-        br.follow_link(link)
+        try:
+            url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
+        except StopIteration:
+            url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
+        br.open(url) #br.follow_link(link)
         #response = br.response()
         #print response.get_data()
         print "-------------------------JS redirect(send autoPostForm)--------------------"
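A sketch of the fallback pattern introduced here; mechanize's links() yields matches lazily, so a missing login link surfaces as StopIteration rather than an empty list. The helper and its fallback argument are illustrative, not part of the recipe.

def find_login_url(br, fallback):
    # br is the mechanize browser returned by BasicNewsRecipe.get_browser();
    # if no link matches the regex, fall back to a hard-coded login URL.
    try:
        return br.links(url_regex='www.nikkei.com/etc/accounts/login').next().url
    except StopIteration:
        return fallback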
@ -15,7 +15,7 @@ class Nin(BasicNewsRecipe):
     publisher = 'NIN d.o.o. - Ringier d.o.o.'
     category = 'news, politics, Serbia'
     no_stylesheets = True
-    oldest_article = 15
+    oldest_article = 180
     encoding = 'utf-8'
     needs_subscription = True
     remove_empty_feeds = True
@ -25,7 +25,7 @@ class Nin(BasicNewsRecipe):
     use_embedded_content = False
     language = 'sr'
     publication_type = 'magazine'
-    masthead_url = 'http://www.nin.co.rs/img/head/logo.jpg'
+    masthead_url = 'http://www.nin.co.rs/img/logo_print.jpg'
     extra_css = """
         @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
         body{font-family: Verdana, Lucida, sans1, sans-serif}
@ -42,11 +42,11 @@ class Nin(BasicNewsRecipe):
         , 'tags' : category
         , 'publisher' : publisher
         , 'language' : language
+        , 'linearize_tables': True
     }

     preprocess_regexps = [
-        (re.compile(r'</body>.*?<html>', re.DOTALL|re.IGNORECASE),lambda match: '</body>')
-        ,(re.compile(r'</html>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</html>')
+        (re.compile(r'<div class="standardFont">.*', re.DOTALL|re.IGNORECASE),lambda match: '')
         ,(re.compile(u'\u0110'), lambda match: u'\u00D0')
     ]

@ -60,42 +60,21 @@ class Nin(BasicNewsRecipe):
         br.submit()
         return br

-    keep_only_tags =[dict(name='td', attrs={'width':'520'})]
-    remove_tags_before =dict(name='span', attrs={'class':'izjava'})
-    remove_tags_after =dict(name='html')
-    remove_tags = [
-        dict(name=['object','link','iframe','meta','base'])
-        ,dict(attrs={'class':['fb-like','twitter-share-button']})
-        ,dict(attrs={'rel':'nofollow'})
-    ]
-    remove_attributes=['border','background','height','width','align','valign']
+    remove_tags_before = dict(name='div', attrs={'class':'titleFont'})
+    remove_tags_after = dict(name='div', attrs={'class':'standardFont'})
+    remove_tags = [dict(name=['object','link','iframe','meta','base'])]
+    remove_attributes = ['border','background','height','width','align','valign']

     def get_cover_url(self):
         cover_url = None
         soup = self.index_to_soup(self.INDEX)
-        for item in soup.findAll('a', href=True):
-            if item['href'].startswith('/pages/issue.php?id='):
-                simg = item.find('img')
-                if simg:
-                    return self.PREFIX + item.img['src']
+        cover = soup.find('img', attrs={'class':'issueImg'})
+        if cover:
+            return self.PREFIX + cover['src']
         return cover_url

     feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('div'):
-            if len(item.contents) == 0:
-                item.extract()
-        for item in soup.findAll(['td','tr']):
-            item.name='div'
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):
-                item['alt'] = 'image'
-        for tbl in soup.findAll('table'):
-            img = tbl.find('img')
-            if img:
-                img.extract()
-                tbl.replaceWith(img)
-        return soup
+    def print_version(self, url):
+        return url + '&pf=1'
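For context, a sketch of how preprocess_regexps entries like the one added above are applied: each (pattern, replacement callable) pair is run over the downloaded article HTML before it is parsed, so a single rule can truncate or rewrite whole blocks of markup.

import re

# Fragment meant to sit at class level in a BasicNewsRecipe subclass; this
# particular rule drops everything from the trailing
# <div class="standardFont"> block onwards, as in the Nin hunk above.
preprocess_regexps = [
    (re.compile(r'<div class="standardFont">.*', re.DOTALL|re.IGNORECASE),
     lambda match: ''),
]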
@ -6,7 +6,6 @@ www.nsfwcorp.com
 '''

 import urllib
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class NotSafeForWork(BasicNewsRecipe):
@ -21,8 +20,9 @@ class NotSafeForWork(BasicNewsRecipe):
     needs_subscription = True
     auto_cleanup = False
     INDEX = 'https://www.nsfwcorp.com'
-    LOGIN = INDEX + '/login'
-    use_embedded_content = False
+    LOGIN = INDEX + '/login/target/'
+    SETTINGS = INDEX + '/settings/'
+    use_embedded_content = True
     language = 'en'
     publication_type = 'magazine'
     masthead_url = 'http://assets.nsfwcorp.com/media/headers/nsfw_banner.jpg'
@ -46,15 +46,6 @@ class NotSafeForWork(BasicNewsRecipe):
         , 'language' : language
     }

-    remove_tags_before = dict(attrs={'id':'fromToLine'})
-    remove_tags_after = dict(attrs={'id':'unlockButtonDiv'})
-    remove_tags=[
-        dict(name=['meta', 'link', 'iframe', 'embed', 'object'])
-        ,dict(name='a', attrs={'class':'switchToDeskNotes'})
-        ,dict(attrs={'id':'unlockButtonDiv'})
-    ]
-    remove_attributes = ['lang']

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.open(self.LOGIN)
@ -65,30 +56,12 @@ class NotSafeForWork(BasicNewsRecipe):
         br.open(self.LOGIN, data)
         return br

-    def parse_index(self):
-        articles = []
-        soup = self.index_to_soup(self.INDEX)
-        dispatches = soup.find(attrs={'id':'dispatches'})
-        if dispatches:
-            for item in dispatches.findAll('h3'):
-                description = u''
-                title_link = item.find('span', attrs={'class':'dispatchTitle'})
-                description_link = item.find('span', attrs={'class':'dispatchSubtitle'})
-                feed_link = item.find('a', href=True)
-                if feed_link:
-                    url = self.INDEX + feed_link['href']
-                    title = self.tag_to_string(title_link)
-                    description = self.tag_to_string(description_link)
-                    date = strftime(self.timefmt)
-                    articles.append({
-                        'title' :title
-                        ,'date' :date
-                        ,'url' :url
-                        ,'description':description
-                    })
-        return [('Dispatches', articles)]
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+    def get_feeds(self):
+        self.feeds = []
+        soup = self.index_to_soup(self.SETTINGS)
+        for item in soup.findAll('input', attrs={'type':'text'}):
+            if item.has_key('value') and item['value'].startswith('http://www.nsfwcorp.com/feed/'):
+                self.feeds.append(item['value'])
+                return self.feeds
+        return self.feeds
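A hedged sketch of the needs_subscription flow this recipe keeps: get_browser() logs in once so that later fetches, including the /settings/ page read by get_feeds above, are made with an authenticated session. The form index and field names below are assumptions for illustration, not taken from the recipe.

    def get_browser(self):
        # Log in once; calibre fills self.username / self.password from the
        # credentials the user enters for a needs_subscription recipe.
        br = BasicNewsRecipe.get_browser()
        br.open(self.LOGIN)
        br.select_form(nr=0)            # assumed: login form is the first form
        br['email'] = self.username     # assumed field names
        br['password'] = self.password
        br.submit()
        return br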
@ -6,22 +6,50 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
nytimes.com
|
nytimes.com
|
||||||
'''
|
'''
|
||||||
import re, string, time
|
import re, string, time
|
||||||
from calibre import entity_to_unicode, strftime
|
from calibre import strftime
|
||||||
from datetime import timedelta, date
|
from datetime import timedelta, date
|
||||||
|
from time import sleep
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
class NYTimes(BasicNewsRecipe):
|
class NYTimes(BasicNewsRecipe):
|
||||||
|
|
||||||
|
recursions=1 # set this to zero to omit Related articles lists
|
||||||
|
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed
|
||||||
|
|
||||||
|
# set getTechBlogs to True to include the technology blogs
|
||||||
|
# set tech_oldest_article to control article age
|
||||||
|
# set tech_max_articles_per_feed to control article count
|
||||||
|
getTechBlogs = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
tech_oldest_article = 14
|
||||||
|
tech_max_articles_per_feed = 25
|
||||||
|
|
||||||
|
# set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
|
||||||
|
# otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
|
||||||
|
getPopularArticles = True
|
||||||
|
popularPeriod = '1' # set this to the number of days to include in the measurement
|
||||||
|
# e.g. 7 will get the most popular measured over the last 7 days
|
||||||
|
# and 30 will get the most popular measured over 30 days.
|
||||||
|
# you still only get up to 20 articles in each category
|
||||||
|
|
||||||
|
|
||||||
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
|
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
|
||||||
headlinesOnly = True
|
headlinesOnly = True
|
||||||
|
|
||||||
# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
|
# set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
|
||||||
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
|
# number of days old an article can be for inclusion. If oldest_web_article = None all articles
|
||||||
# will be included. Note: oldest_article is ignored if webEdition = False
|
# will be included. Note: oldest_web_article is ignored if webEdition = False
|
||||||
webEdition = False
|
webEdition = False
|
||||||
oldest_article = 7
|
oldest_web_article = 7
|
||||||
|
|
||||||
|
# download higher resolution images than the small thumbnails typically included in the article
|
||||||
|
# the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
|
||||||
|
useHighResImages = True
|
||||||
|
|
||||||
|
# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
|
||||||
|
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
|
||||||
|
replaceKindleVersion = False
|
||||||
|
|
||||||
# includeSections: List of sections to include. If empty, all sections found will be included.
|
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||||
# Otherwise, only the sections named will be included. For example,
|
# Otherwise, only the sections named will be included. For example,
|
||||||
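A small self-contained sketch of the web-edition age filter configured above; the helper name is illustrative, and oldest_web_article = None keeps every article, matching the comment in the hunk.

from datetime import date, timedelta

def article_is_recent(article_date, oldest_web_article):
    # article_date is a datetime.date, e.g. parsed out of the article URL as
    # decode_url_date does; None disables the age check entirely.
    if oldest_web_article is None:
        return True
    earliest = date.today() - timedelta(days=oldest_web_article)
    return article_date >= earliest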
@ -82,57 +110,68 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
('Education',u'education'),
|
('Education',u'education'),
|
||||||
('Multimedia',u'multimedia'),
|
('Multimedia',u'multimedia'),
|
||||||
(u'Obituaries',u'obituaries'),
|
(u'Obituaries',u'obituaries'),
|
||||||
(u'Sunday Magazine',u'magazine'),
|
(u'Sunday Magazine',u'magazine')
|
||||||
(u'Week in Review',u'weekinreview')]
|
]
|
||||||
|
|
||||||
|
tech_feeds = [
|
||||||
|
(u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'),
|
||||||
|
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
|
||||||
|
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
|
||||||
|
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
if headlinesOnly:
|
if headlinesOnly:
|
||||||
title='New York Times Headlines'
|
title='New York Times Headlines'
|
||||||
description = 'Headlines from the New York Times. Needs a subscription from http://www.nytimes.com'
|
description = 'Headlines from the New York Times'
|
||||||
needs_subscription = 'optional'
|
needs_subscription = False
|
||||||
elif webEdition:
|
elif webEdition:
|
||||||
title='New York Times (Web)'
|
title='New York Times (Web)'
|
||||||
description = 'New York Times on the Web'
|
description = 'New York Times on the Web'
|
||||||
needs_subscription = True
|
needs_subscription = False
|
||||||
|
elif replaceKindleVersion:
|
||||||
|
title='The New York Times'
|
||||||
|
description = 'Today\'s New York Times'
|
||||||
|
needs_subscription = False
|
||||||
else:
|
else:
|
||||||
title='New York Times'
|
title='New York Times'
|
||||||
description = 'Today\'s New York Times'
|
description = 'Today\'s New York Times'
|
||||||
needs_subscription = True
|
needs_subscription = False
|
||||||
|
|
||||||
|
def decode_url_date(self,url):
|
||||||
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
|
urlitems = url.split('/')
|
||||||
|
|
||||||
def decode_us_date(self,datestr):
|
|
||||||
udate = datestr.strip().lower().split()
|
|
||||||
try:
|
try:
|
||||||
m = self.month_list.index(udate[0])+1
|
d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
|
||||||
except:
|
except:
|
||||||
return date.today()
|
|
||||||
d = int(udate[1])
|
|
||||||
y = int(udate[2])
|
|
||||||
try:
|
try:
|
||||||
d = date(y,m,d)
|
d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
|
||||||
except:
|
except:
|
||||||
d = date.today
|
return None
|
||||||
return d
|
return d
|
||||||
|
|
||||||
earliest_date = date.today() - timedelta(days=oldest_article)
|
if oldest_web_article is None:
|
||||||
|
earliest_date = date.today()
|
||||||
|
else:
|
||||||
|
earliest_date = date.today() - timedelta(days=oldest_web_article)
|
||||||
|
oldest_article = 365 # by default, a long time ago
|
||||||
|
|
||||||
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
requires_version = (0, 7, 5)
|
requires_version = (0, 7, 5)
|
||||||
|
encoding = 'utf-8'
|
||||||
|
|
||||||
timefmt = ''
|
timefmt = ''
|
||||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
|
||||||
|
#simultaneous_downloads = 1 # no longer required to deal with ads
|
||||||
|
|
||||||
cover_margins = (18,18,'grey99')
|
cover_margins = (18,18,'grey99')
|
||||||
|
|
||||||
remove_tags_before = dict(id='article')
|
remove_tags_before = dict(id='article')
|
||||||
remove_tags_after = dict(id='article')
|
remove_tags_after = dict(id='article')
|
||||||
remove_tags = [dict(attrs={'class':[
|
remove_tags = [
|
||||||
|
dict(attrs={'class':[
|
||||||
'articleFooter',
|
'articleFooter',
|
||||||
'articleTools',
|
'articleTools',
|
||||||
'columnGroup doubleRule',
|
|
||||||
'columnGroup singleRule',
|
'columnGroup singleRule',
|
||||||
'columnGroup last',
|
'columnGroup last',
|
||||||
'columnGroup last',
|
'columnGroup last',
|
||||||
@ -140,7 +179,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'dottedLine',
|
'dottedLine',
|
||||||
'entry-meta',
|
'entry-meta',
|
||||||
'entry-response module',
|
'entry-response module',
|
||||||
'icon enlargeThis',
|
|
||||||
'leftNavTabs',
|
'leftNavTabs',
|
||||||
'metaFootnote',
|
'metaFootnote',
|
||||||
'module box nav',
|
'module box nav',
|
||||||
@ -150,10 +188,44 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'relatedSearchesModule',
|
'relatedSearchesModule',
|
||||||
'side_tool',
|
'side_tool',
|
||||||
'singleAd',
|
'singleAd',
|
||||||
|
'entry entry-utility', #added for DealBook
|
||||||
|
'entry-tags', #added for DealBook
|
||||||
|
'footer promos clearfix', #added for DealBook
|
||||||
|
'footer links clearfix', #added for DealBook
|
||||||
|
'tabsContainer', #added for other blog downloads
|
||||||
|
'column lastColumn', #added for other blog downloads
|
||||||
|
'pageHeaderWithLabel', #added for other gadgetwise downloads
|
||||||
|
'column two', #added for other blog downloads
|
||||||
|
'column two last', #added for other blog downloads
|
||||||
|
'column three', #added for other blog downloads
|
||||||
|
'column three last', #added for other blog downloads
|
||||||
|
'column four',#added for other blog downloads
|
||||||
|
'column four last',#added for other blog downloads
|
||||||
|
'column last', #added for other blog downloads
|
||||||
|
'entry entry-related',
|
||||||
|
'subNavigation tabContent active', #caucus blog navigation
|
||||||
|
'mediaOverlay slideshow',
|
||||||
|
'wideThumb',
|
||||||
|
'video', #added 02-11-2011
|
||||||
|
'videoHeader',#added 02-11-2011
|
||||||
|
'articleInlineVideoHolder', #added 02-11-2011
|
||||||
|
'assetCompanionAd',
|
||||||
re.compile('^subNavigation'),
|
re.compile('^subNavigation'),
|
||||||
re.compile('^leaderboard'),
|
re.compile('^leaderboard'),
|
||||||
re.compile('^module'),
|
re.compile('^module'),
|
||||||
|
re.compile('commentCount'),
|
||||||
|
'credit'
|
||||||
]}),
|
]}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
|
||||||
|
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
|
||||||
|
dict(name='div', attrs={'class':'tweet'}),
|
||||||
|
dict(name='span', attrs={'class':'commentCount meta'}),
|
||||||
|
dict(name='div', attrs={'id':'header'}),
|
||||||
|
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
|
||||||
|
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
|
||||||
|
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
|
||||||
|
dict(name='div', attrs={'id':re.compile('respond')}), # open
|
||||||
|
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
|
||||||
dict(id=[
|
dict(id=[
|
||||||
'adxLeaderboard',
|
'adxLeaderboard',
|
||||||
'adxSponLink',
|
'adxSponLink',
|
||||||
@ -183,22 +255,29 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'side_index',
|
'side_index',
|
||||||
'side_tool',
|
'side_tool',
|
||||||
'toolsRight',
|
'toolsRight',
|
||||||
|
'skybox', #added for DealBook
|
||||||
|
'TopAd', #added for DealBook
|
||||||
|
'related-content', #added for DealBook
|
||||||
]),
|
]),
|
||||||
dict(name=['script', 'noscript', 'style','form','hr'])]
|
dict(name=['script', 'noscript', 'style','form','hr'])]
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
|
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
|
||||||
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.timestamp { text-align: left; font-size: small; }
|
.timestamp { font-weight: normal; text-align: left; font-size: 50%; }
|
||||||
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
a:link {text-decoration: none; }
|
a:link {text-decoration: none; }
|
||||||
|
.date{font-size: 50%; }
|
||||||
|
.update{font-size: 50%; }
|
||||||
.articleBody { }
|
.articleBody { }
|
||||||
.authorId {text-align: left; }
|
.authorId {text-align: left; font-size: 50%; }
|
||||||
.image {text-align: center;}
|
.image {text-align: center;}
|
||||||
.source {text-align: left; }'''
|
.aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
|
||||||
|
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
|
||||||
|
.source {text-align: left; font-size: x-small; }'''
|
||||||
|
|
||||||
|
|
||||||
articles = {}
|
articles = {}
|
||||||
@ -222,11 +301,11 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
del ans[idx]
|
del ans[idx]
|
||||||
idx_max = idx_max-1
|
idx_max = idx_max-1
|
||||||
continue
|
continue
|
||||||
if self.verbose:
|
if True: #self.verbose
|
||||||
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
||||||
for article in ans[idx][1]:
|
for article in ans[idx][1]:
|
||||||
total_article_count += 1
|
total_article_count += 1
|
||||||
if self.verbose:
|
if True: #self.verbose
|
||||||
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
||||||
article['url'].encode('cp1252','replace')))
|
article['url'].encode('cp1252','replace')))
|
||||||
idx = idx+1
|
idx = idx+1
|
||||||
@ -237,7 +316,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
def exclude_url(self,url):
|
def exclude_url(self,url):
|
||||||
if not url.startswith("http"):
|
if not url.startswith("http"):
|
||||||
return True
|
return True
|
||||||
if not url.endswith(".html"):
|
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
|
||||||
return True
|
return True
|
||||||
if 'nytimes.com' not in url:
|
if 'nytimes.com' not in url:
|
||||||
return True
|
return True
|
||||||
@ -280,88 +359,78 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
if self.username is not None and self.password is not None:
|
|
||||||
br.open('http://www.nytimes.com/auth/login')
|
|
||||||
br.form = br.forms().next()
|
|
||||||
br['userid'] = self.username
|
|
||||||
br['password'] = self.password
|
|
||||||
raw = br.submit().read()
|
|
||||||
if 'Please try again' in raw:
|
|
||||||
raise Exception('Your username and password are incorrect')
|
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
cover_tag = 'NY_NYT'
|
||||||
# Skip ad pages served before actual article
|
|
||||||
skip_tag = soup.find(True, {'name':'skip'})
|
|
||||||
if skip_tag is not None:
|
|
||||||
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
|
|
||||||
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
|
||||||
url += '?pagewanted=all'
|
|
||||||
self.log.warn("Skipping ad to article at '%s'" % url)
|
|
||||||
return self.index_to_soup(url, raw=True)
|
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover = None
|
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
|
||||||
st = time.localtime()
|
br = BasicNewsRecipe.get_browser()
|
||||||
year = str(st.tm_year)
|
daysback=1
|
||||||
month = "%.2d" % st.tm_mon
|
try:
|
||||||
day = "%.2d" % st.tm_mday
|
br.open(cover)
|
||||||
cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
|
except:
|
||||||
|
while daysback<7:
|
||||||
|
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
try:
|
try:
|
||||||
br.open(cover)
|
br.open(cover)
|
||||||
except:
|
except:
|
||||||
|
daysback = daysback+1
|
||||||
|
continue
|
||||||
|
break
|
||||||
|
if daysback==7:
|
||||||
self.log("\nCover unavailable")
|
self.log("\nCover unavailable")
|
||||||
cover = None
|
cover = None
|
||||||
return cover
|
return cover
|
||||||
|
|
||||||
|
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
||||||
|
|
||||||
|
|
||||||
def short_title(self):
|
def short_title(self):
|
||||||
return self.title
|
return self.title
|
||||||

-def index_to_soup(self, url_or_raw, raw=False):
-'''
+def article_to_soup(self, url_or_raw, raw=False):
-OVERRIDE of class method
+from contextlib import closing
-deals with various page encodings between index and articles
+import copy
-'''
+from calibre.ebooks.chardet import xml_to_unicode
-def get_the_soup(docEncoding, url_or_raw, raw=False) :
+print("ARTICLE_TO_SOUP "+url_or_raw)
 if re.match(r'\w+://', url_or_raw):
 br = self.clone_browser(self.browser)
-f = br.open_novisit(url_or_raw)
+open_func = getattr(br, 'open_novisit', br.open)
+with closing(open_func(url_or_raw)) as f:
 _raw = f.read()
-f.close()
 if not _raw:
 raise RuntimeError('Could not fetch index from %s'%url_or_raw)
 else:
 _raw = url_or_raw
 if raw:
 return _raw

 if not isinstance(_raw, unicode) and self.encoding:
-_raw = _raw.decode(docEncoding, 'replace')
+if callable(self.encoding):
-massage = list(BeautifulSoup.MARKUP_MASSAGE)
+_raw = self.encoding(_raw)
-massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+else:
-return BeautifulSoup(_raw, markupMassage=massage)
+_raw = _raw.decode(self.encoding, 'replace')

-# Entry point
+nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
-soup = get_the_soup( self.encoding, url_or_raw )
+nmassage.extend(self.preprocess_regexps)
-contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
+nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
-docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+# Some websites have buggy doctype declarations that mess up beautifulsoup
-if docEncoding == '' :
+# Remove comments as they can leave detritus when extracting tags leaves
-docEncoding = self.encoding
+# multiple nested comments
+nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
+usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
+usrc = self.preprocess_raw_html(usrc, url_or_raw)
+return BeautifulSoup(usrc, markupMassage=nmassage)

-if self.verbose > 2:
-self.log( " document encoding: '%s'" % docEncoding)
-if docEncoding != self.encoding :
-soup = get_the_soup(docEncoding, url_or_raw)

-return soup

 def massageNCXText(self, description):
 # Kindle TOC descriptions won't render certain characters
 if description:
 massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
 # Replace '&' with '&'
 massaged = re.sub("&","&", massaged)
+massaged = re.sub("&","&", massaged)
 return self.fixChars(massaged)
 else:
 return description
@@ -383,6 +452,16 @@ class NYTimes(BasicNewsRecipe):
 if self.filterDuplicates:
 if url in self.url_list:
 return
+if self.webEdition:
+date_tag = self.decode_url_date(url)
+if date_tag is not None:
+if self.oldest_web_article is not None:
+if date_tag < self.earliest_date:
+self.log("Skipping article %s" % url)
+return
+else:
+self.log("Skipping article %s" % url)
+return
 self.url_list.append(url)
 title = self.tag_to_string(a, use_alt=True).strip()
 description = ''
@@ -407,6 +486,92 @@ class NYTimes(BasicNewsRecipe):
 description=description, author=author,
 content=''))

+def get_popular_articles(self,ans):
+if self.getPopularArticles:
+popular_articles = {}
+key_list = []

+def handleh3(h3tag):
+try:
+url = h3tag.a['href']
+except:
+return ('','','','')
+url = re.sub(r'\?.*', '', url)
+if self.exclude_url(url):
+return ('','','','')
+url += '?pagewanted=all'
+title = self.tag_to_string(h3tag.a,False)
+h6tag = h3tag.findNextSibling('h6')
+if h6tag is not None:
+author = self.tag_to_string(h6tag,False)
+else:
+author = ''
+ptag = h3tag.findNextSibling('p')
+if ptag is not None:
+desc = self.tag_to_string(ptag,False)
+else:
+desc = ''
+return(title,url,author,desc)


+have_emailed = False
+emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
+for h3tag in emailed_soup.findAll('h3'):
+(title,url,author,desc) = handleh3(h3tag)
+if url=='':
+continue
+if not have_emailed:
+key_list.append('Most E-Mailed')
+popular_articles['Most E-Mailed'] = []
+have_emailed = True
+popular_articles['Most E-Mailed'].append(
+dict(title=title, url=url, date=strftime('%a, %d %b'),
+description=desc, author=author,
+content=''))
+have_viewed = False
+viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod)
+for h3tag in viewed_soup.findAll('h3'):
+(title,url,author,desc) = handleh3(h3tag)
+if url=='':
+continue
+if not have_viewed:
+key_list.append('Most Viewed')
+popular_articles['Most Viewed'] = []
+have_viewed = True
+popular_articles['Most Viewed'].append(
+dict(title=title, url=url, date=strftime('%a, %d %b'),
+description=desc, author=author,
+content=''))
+viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+for x in viewed_ans:
+ans.append(x)
+return ans

+def get_tech_feeds(self,ans):
+if self.getTechBlogs:
+tech_articles = {}
+key_list = []
+save_oldest_article = self.oldest_article
+save_max_articles_per_feed = self.max_articles_per_feed
+self.oldest_article = self.tech_oldest_article
+self.max_articles_per_feed = self.tech_max_articles_per_feed
+self.feeds = self.tech_feeds
+tech = self.parse_feeds()
+self.oldest_article = save_oldest_article
+self.max_articles_per_feed = save_max_articles_per_feed
+self.feeds = None
+for f in tech:
+key_list.append(f.title)
+tech_articles[f.title] = []
+for a in f.articles:
+tech_articles[f.title].append(
+dict(title=a.title, url=a.url, date=a.date,
+description=a.summary, author=a.author,
+content=a.content))
+tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+for x in tech_ans:
+ans.append(x)
+return ans

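get_tech_feeds above reuses parse_feeds() by temporarily swapping oldest_article, max_articles_per_feed and feeds, then restoring the saved values. A hedged sketch of the same save-and-restore idea as a small reusable helper (not part of the recipe; only parse_feeds and the attribute names come from the code above):

from contextlib import contextmanager

@contextmanager
def swapped_attrs(obj, **temp):
    # Temporarily set attributes on obj, then restore the originals.
    saved = dict((k, getattr(obj, k)) for k in temp)
    try:
        for k, v in temp.items():
            setattr(obj, k, v)
        yield obj
    finally:
        for k, v in saved.items():
            setattr(obj, k, v)

# e.g., the body of get_tech_feeds could read:
#     with swapped_attrs(self, feeds=self.tech_feeds,
#                        oldest_article=self.tech_oldest_article,
#                        max_articles_per_feed=self.tech_max_articles_per_feed):
#         tech = self.parse_feeds()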
 def parse_web_edition(self):

@@ -418,31 +583,41 @@ class NYTimes(BasicNewsRecipe):
 if sec_title in self.excludeSections:
 print "SECTION EXCLUDED: ",sec_title
 continue
-print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
+try:
 soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+except:
+continue
+print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'

 self.key = sec_title
 # Find each article
 for div in soup.findAll(True,
-attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
-if div['class'] in ['story', 'story headline'] :
+if div['class'] in ['story', 'story headline', 'storyHeader'] :
 self.handle_article(div)
+elif div['class'] == 'ledeStory':
+divsub = div.find('div','storyHeader')
+if divsub is not None:
+self.handle_article(divsub)
+ulrefer = div.find('ul','refer')
+if ulrefer is not None:
+for lidiv in ulrefer.findAll('li'):
+self.handle_article(lidiv)
 elif div['class'] == 'headlinesOnly multiline flush':
 for lidiv in div.findAll('li'):
 self.handle_article(lidiv)

 self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-return self.filter_ans(self.ans)
+return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

 def parse_todays_index(self):

 soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

 skipping = False
 # Find each article
 for div in soup.findAll(True,
 attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

 if div['class'] in ['section-headline','sectionHeader']:
 self.key = string.capwords(self.feed_title(div))
 self.key = self.key.replace('Op-ed','Op-Ed')
@@ -466,7 +641,7 @@ class NYTimes(BasicNewsRecipe):
 self.handle_article(lidiv)

 self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-return self.filter_ans(self.ans)
+return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

 def parse_headline_index(self):

@@ -514,7 +689,7 @@ class NYTimes(BasicNewsRecipe):
 for h3_item in search_div.findAll('h3'):
 byline = h3_item.h6
 if byline is not None:
-author = self.tag_to_string(byline,usa_alt=False)
+author = self.tag_to_string(byline,use_alt=False)
 else:
 author = ''
 a = h3_item.find('a', href=True)
@@ -540,7 +715,7 @@ class NYTimes(BasicNewsRecipe):
 self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

 self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-return self.filter_ans(self.ans)
+return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

 def parse_index(self):
 if self.headlinesOnly:
@ -550,32 +725,191 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
return self.parse_todays_index()
|
return self.parse_todays_index()
|
||||||
|
|
||||||
def strip_anchors(self,soup):
|
def strip_anchors(self,soup,kill_all=False):
|
||||||
paras = soup.findAll(True)
|
paras = soup.findAll(True)
|
||||||
for para in paras:
|
for para in paras:
|
||||||
aTags = para.findAll('a')
|
aTags = para.findAll('a')
|
||||||
for a in aTags:
|
for a in aTags:
|
||||||
if a.img is None:
|
if a.img is None:
|
||||||
a.replaceWith(a.renderContents().decode('cp1252','replace'))
|
if kill_all or (self.recursions==0):
|
||||||
|
a.replaceWith(self.tag_to_string(a,False))
|
||||||
|
else:
|
||||||
|
if a.has_key('href'):
|
||||||
|
if a['href'].startswith('http://www.nytimes'):
|
||||||
|
if not a['href'].endswith('pagewanted=all'):
|
||||||
|
url = re.sub(r'\?.*', '', a['href'])
|
||||||
|
if self.exclude_url(url):
|
||||||
|
a.replaceWith(self.tag_to_string(a,False))
|
||||||
|
else:
|
||||||
|
a['href'] = url+'?pagewanted=all'
|
||||||
|
elif not (a['href'].startswith('http://pogue') or \
|
||||||
|
a['href'].startswith('http://bits') or \
|
||||||
|
a['href'].startswith('http://travel') or \
|
||||||
|
a['href'].startswith('http://business') or \
|
||||||
|
a['href'].startswith('http://tech') or \
|
||||||
|
a['href'].startswith('http://health') or \
|
||||||
|
a['href'].startswith('http://dealbook') or \
|
||||||
|
a['href'].startswith('http://open')):
|
||||||
|
a.replaceWith(self.tag_to_string(a,False))
|
||||||
|
return soup
|
||||||
|
|
||||||
|
def handle_tags(self,soup):
|
||||||
|
try:
|
||||||
|
print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
|
||||||
|
except:
|
||||||
|
print("HANDLE TAGS: NO TITLE")
|
||||||
|
if soup is None:
|
||||||
|
print("ERROR: handle_tags received NoneType")
|
||||||
|
return None
|
||||||
|
|
||||||
|
## print("HANDLING AD FORWARD:")
|
||||||
|
## print(soup)
|
||||||
|
if self.keep_only_tags:
|
||||||
|
body = Tag(soup, 'body')
|
||||||
|
try:
|
||||||
|
if isinstance(self.keep_only_tags, dict):
|
||||||
|
self.keep_only_tags = [self.keep_only_tags]
|
||||||
|
for spec in self.keep_only_tags:
|
||||||
|
for tag in soup.find('body').findAll(**spec):
|
||||||
|
body.insert(len(body.contents), tag)
|
||||||
|
soup.find('body').replaceWith(body)
|
||||||
|
except AttributeError: # soup has no body element
|
||||||
|
pass
|
||||||
|
|
||||||
|
def remove_beyond(tag, next):
|
||||||
|
while tag is not None and getattr(tag, 'name', None) != 'body':
|
||||||
|
after = getattr(tag, next)
|
||||||
|
while after is not None:
|
||||||
|
ns = getattr(tag, next)
|
||||||
|
after.extract()
|
||||||
|
after = ns
|
||||||
|
tag = tag.parent
|
||||||
|
|
||||||
|
if self.remove_tags_after is not None:
|
||||||
|
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
|
||||||
|
for spec in rt:
|
||||||
|
tag = soup.find(**spec)
|
||||||
|
remove_beyond(tag, 'nextSibling')
|
||||||
|
|
||||||
|
if self.remove_tags_before is not None:
|
||||||
|
tag = soup.find(**self.remove_tags_before)
|
||||||
|
remove_beyond(tag, 'previousSibling')
|
||||||
|
|
||||||
|
for kwds in self.remove_tags:
|
||||||
|
for tag in soup.findAll(**kwds):
|
||||||
|
tag.extract()
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
#print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
|
||||||
|
skip_tag = soup.find(True, {'name':'skip'})
|
||||||
|
if skip_tag is not None:
|
||||||
|
#url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
||||||
|
url = 'http://www.nytimes.com' + skip_tag.parent['href']
|
||||||
|
#url += '?pagewanted=all'
|
||||||
|
self.log.warn("Skipping ad to article at '%s'" % url)
|
||||||
|
sleep(5)
|
||||||
|
soup = self.handle_tags(self.article_to_soup(url))
|
||||||
|
|
||||||
if self.webEdition & (self.oldest_article>0):
|
# check if the article is from one of the tech blogs
|
||||||
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})
|
||||||
if date_tag:
|
|
||||||
date_str = self.tag_to_string(date_tag,use_alt=False)
|
if blog is not None:
|
||||||
date_str = date_str.replace('Published:','')
|
old_body = soup.find('body')
|
||||||
date_items = date_str.split(',')
|
new_body=Tag(soup,'body')
|
||||||
try:
|
new_body.append(soup.find('div',attrs={'id':'content'}))
|
||||||
datestring = date_items[0]+' '+date_items[1]
|
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
|
||||||
article_date = self.decode_us_date(datestring)
|
old_body.replaceWith(new_body)
|
||||||
except:
|
for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
|
||||||
article_date = date.today()
|
if divr.find(text=re.compile('Sign up')):
|
||||||
if article_date < self.earliest_date:
|
divr.extract()
|
||||||
self.log("Skipping article dated %s" % date_str)
|
divr = soup.find('div',attrs={'id':re.compile('related-content')})
|
||||||
return None
|
if divr is not None:
|
||||||
|
# handle related articles
|
||||||
|
rlist = []
|
||||||
|
ul = divr.find('ul')
|
||||||
|
if ul is not None:
|
||||||
|
for li in ul.findAll('li'):
|
||||||
|
atag = li.find('a')
|
||||||
|
if atag is not None:
|
||||||
|
if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
|
||||||
|
atag['href'].startswith('http://open'):
|
||||||
|
atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
|
||||||
|
rlist.append(atag)
|
||||||
|
divr.extract()
|
||||||
|
if rlist != []:
|
||||||
|
asidediv = Tag(soup,'div',[('class','aside')])
|
||||||
|
if soup.find('hr') is None:
|
||||||
|
asidediv.append(Tag(soup,'hr'))
|
||||||
|
h4 = Tag(soup,'h4',[('class','asidenote')])
|
||||||
|
h4.insert(0,"Related Posts")
|
||||||
|
asidediv.append(h4)
|
||||||
|
ul = Tag(soup,'ul')
|
||||||
|
for r in rlist:
|
||||||
|
li = Tag(soup,'li',[('class','aside')])
|
||||||
|
r['class'] = 'aside'
|
||||||
|
li.append(r)
|
||||||
|
ul.append(li)
|
||||||
|
asidediv.append(ul)
|
||||||
|
asidediv.append(Tag(soup,'hr'))
|
||||||
|
smain = soup.find('body')
|
||||||
|
smain.append(asidediv)
|
||||||
|
for atag in soup.findAll('a'):
|
||||||
|
img = atag.find('img')
|
||||||
|
if img is not None:
|
||||||
|
atag.replaceWith(img)
|
||||||
|
elif not atag.has_key('href'):
|
||||||
|
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
|
||||||
|
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
|
||||||
|
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
|
||||||
|
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
|
||||||
|
hdr = soup.find('address')
|
||||||
|
if hdr is not None:
|
||||||
|
hdr.name='span'
|
||||||
|
for span_credit in soup.findAll('span','credit'):
|
||||||
|
sp = Tag(soup,'span')
|
||||||
|
span_credit.replaceWith(sp)
|
||||||
|
sp.append(Tag(soup,'br'))
|
||||||
|
sp.append(span_credit)
|
||||||
|
sp.append(Tag(soup,'br'))
|
||||||
|
|
||||||
|
else: # nytimes article
|
||||||
|
|
||||||
|
related = [] # these will be the related articles
|
||||||
|
first_outer = None # first related outer tag
|
||||||
|
first_related = None # first related tag
|
||||||
|
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
|
||||||
|
for rdiv in soup.findAll('div','columnGroup doubleRule'):
|
||||||
|
if rdiv.find('h3') is not None:
|
||||||
|
if self.tag_to_string(rdiv.h3,False).startswith('Related'):
|
||||||
|
rdiv.h3.find(text=True).replaceWith("Related articles")
|
||||||
|
rdiv.h3['class'] = 'asidenote'
|
||||||
|
for litag in rdiv.findAll('li'):
|
||||||
|
if litag.find('a') is not None:
|
||||||
|
if litag.find('a')['href'].startswith('http://www.nytimes.com'):
|
||||||
|
url = re.sub(r'\?.*', '', litag.find('a')['href'])
|
||||||
|
litag.find('a')['href'] = url+'?pagewanted=all'
|
||||||
|
litag.extract()
|
||||||
|
related.append(litag)
|
||||||
|
if first_related is None:
|
||||||
|
first_related = rdiv
|
||||||
|
first_outer = outerdiv
|
||||||
|
else:
|
||||||
|
litag.extract()
|
||||||
|
if related != []:
|
||||||
|
for r in related:
|
||||||
|
if r.h6: # don't want the anchor inside a h6 tag
|
||||||
|
r.h6.replaceWith(r.h6.a)
|
||||||
|
first_related.ul.append(r)
|
||||||
|
first_related.insert(0,Tag(soup,'hr'))
|
||||||
|
first_related.append(Tag(soup,'hr'))
|
||||||
|
first_related['class'] = 'aside'
|
||||||
|
first_outer.replaceWith(first_related) # replace the outer tag with the related tag
|
||||||
|
|
||||||
|
for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
|
||||||
|
rdiv.extract()
|
||||||
|
|
||||||
kicker_tag = soup.find(attrs={'class':'kicker'})
|
kicker_tag = soup.find(attrs={'class':'kicker'})
|
||||||
if kicker_tag: # remove Op_Ed author head shots
|
if kicker_tag: # remove Op_Ed author head shots
|
||||||
@ -584,9 +918,78 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
img_div = soup.find('div','inlineImage module')
|
img_div = soup.find('div','inlineImage module')
|
||||||
if img_div:
|
if img_div:
|
||||||
img_div.extract()
|
img_div.extract()
|
||||||
return self.strip_anchors(soup)
|
|
||||||
|
|
||||||
def postprocess_html(self,soup, True):
|
if self.useHighResImages:
|
||||||
|
try:
|
||||||
|
#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
|
||||||
|
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
|
||||||
|
if enlargeThisList:
|
||||||
|
for popupref in enlargeThisList:
|
||||||
|
popupreflink = popupref.find('a')
|
||||||
|
if popupreflink:
|
||||||
|
reflinkstring = str(popupreflink['href'])
|
||||||
|
refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
|
||||||
|
refend = reflinkstring.find(".html", refstart) + len(".html")
|
||||||
|
reflinkstring = reflinkstring[refstart:refend]
|
||||||
|
|
||||||
|
popuppage = self.browser.open(reflinkstring)
|
||||||
|
popuphtml = popuppage.read()
|
||||||
|
popuppage.close()
|
||||||
|
if popuphtml:
|
||||||
|
st = time.localtime()
|
||||||
|
year = str(st.tm_year)
|
||||||
|
month = "%.2d" % st.tm_mon
|
||||||
|
day = "%.2d" % st.tm_mday
|
||||||
|
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
|
||||||
|
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
|
||||||
|
popupSoup = BeautifulSoup(popuphtml)
|
||||||
|
highResTag = popupSoup.find('img', {'src':highResImageLink})
|
||||||
|
if highResTag:
|
||||||
|
try:
|
||||||
|
newWidth = highResTag['width']
|
||||||
|
newHeight = highResTag['height']
|
||||||
|
imageTag = popupref.parent.find("img")
|
||||||
|
except:
|
||||||
|
self.log("Error: finding width and height of img")
|
||||||
|
popupref.extract()
|
||||||
|
if imageTag:
|
||||||
|
try:
|
||||||
|
imageTag['src'] = highResImageLink
|
||||||
|
imageTag['width'] = newWidth
|
||||||
|
imageTag['height'] = newHeight
|
||||||
|
except:
|
||||||
|
self.log("Error setting the src width and height parameters")
|
||||||
|
except Exception:
|
||||||
|
self.log("Error pulling high resolution images")
|
||||||
|
|
||||||
|
try:
|
||||||
|
#in case pulling images failed, delete the enlarge this text
|
||||||
|
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
|
||||||
|
if enlargeThisList:
|
||||||
|
for popupref in enlargeThisList:
|
||||||
|
popupref.extract()
|
||||||
|
except:
|
||||||
|
self.log("Error removing Enlarge this text")
|
||||||
|
|
||||||
|
|
||||||
|
return self.strip_anchors(soup,False)
|
||||||
|
|
||||||
|
def postprocess_html(self,soup,first_fetch):
|
||||||
|
if not first_fetch: # remove Related links
|
||||||
|
for aside in soup.findAll('div','aside'):
|
||||||
|
aside.extract()
|
||||||
|
soup = self.strip_anchors(soup,True)
|
||||||
|
#print("RECURSIVE: "+self.tag_to_string(soup.title))
|
||||||
|
|
||||||
|
if soup.find('div',attrs={'id':'blogcontent'}) is None:
|
||||||
|
if first_fetch:
|
||||||
|
aside = soup.find('div','aside')
|
||||||
|
if aside is not None: # move the related list to the end of the article
|
||||||
|
art = soup.find('div',attrs={'id':'article'})
|
||||||
|
if art is None:
|
||||||
|
art = soup.find('div',attrs={'class':'article'})
|
||||||
|
if art is not None:
|
||||||
|
art.append(aside)
|
||||||
try:
|
try:
|
||||||
if self.one_picture_per_article:
|
if self.one_picture_per_article:
|
||||||
# Remove all images after first
|
# Remove all images after first
|
||||||
@ -642,6 +1045,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
try:
|
try:
|
||||||
# Change <nyt_headline> to <h2>
|
# Change <nyt_headline> to <h2>
|
||||||
h1 = soup.find('h1')
|
h1 = soup.find('h1')
|
||||||
|
blogheadline = str(h1) #added for dealbook
|
||||||
if h1:
|
if h1:
|
||||||
headline = h1.find("nyt_headline")
|
headline = h1.find("nyt_headline")
|
||||||
if headline:
|
if headline:
|
||||||
@ -649,13 +1053,19 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
tag['class'] = "headline"
|
tag['class'] = "headline"
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
tag.insert(0, self.fixChars(headline.contents[0]))
|
||||||
h1.replaceWith(tag)
|
h1.replaceWith(tag)
|
||||||
|
elif blogheadline.find('entry-title'):#added for dealbook
|
||||||
|
tag = Tag(soup, "h2")#added for dealbook
|
||||||
|
tag['class'] = "headline"#added for dealbook
|
||||||
|
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
|
||||||
|
h1.replaceWith(tag)#added for dealbook
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Blog entry - replace headline, remove <hr> tags
|
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
|
||||||
headline = soup.find('title')
|
headline = soup.find('title')
|
||||||
if headline:
|
if headline:
|
||||||
tag = Tag(soup, "h2")
|
tag = Tag(soup, "h2")
|
||||||
tag['class'] = "headline"
|
tag['class'] = "headline"
|
||||||
tag.insert(0, self.fixChars(headline.contents[0]))
|
tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
|
||||||
soup.insert(0, tag)
|
soup.insert(0, tag)
|
||||||
hrs = soup.findAll('hr')
|
hrs = soup.findAll('hr')
|
||||||
for hr in hrs:
|
for hr in hrs:
|
||||||
@ -663,6 +1073,29 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
except:
|
except:
|
||||||
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
|
||||||
|
|
||||||
|
try:
|
||||||
|
#if this is from a blog (dealbook, fix the byline format
|
||||||
|
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
|
||||||
|
if bylineauthor:
|
||||||
|
tag = Tag(soup, "h6")
|
||||||
|
tag['class'] = "byline"
|
||||||
|
tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
|
||||||
|
bylineauthor.replaceWith(tag)
|
||||||
|
except:
|
||||||
|
self.log("ERROR: fixing byline author format")
|
||||||
|
|
||||||
|
try:
|
||||||
|
#if this is a blog (dealbook) fix the credit style for the pictures
|
||||||
|
blogcredit = soup.find('div',attrs={'class':'credit'})
|
||||||
|
if blogcredit:
|
||||||
|
tag = Tag(soup, "h6")
|
||||||
|
tag['class'] = "credit"
|
||||||
|
tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
|
||||||
|
blogcredit.replaceWith(tag)
|
||||||
|
except:
|
||||||
|
self.log("ERROR: fixing credit format")
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Change <h1> to <h3> - used in editorial blogs
|
# Change <h1> to <h3> - used in editorial blogs
|
||||||
masthead = soup.find("h1")
|
masthead = soup.find("h1")
|
||||||
@ -685,6 +1118,13 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
subhead.replaceWith(bTag)
|
subhead.replaceWith(bTag)
|
||||||
except:
|
except:
|
||||||
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
|
||||||
|
try:
|
||||||
|
#remove the <strong> update tag
|
||||||
|
blogupdated = soup.find('span', {'class':'update'})
|
||||||
|
if blogupdated:
|
||||||
|
blogupdated.replaceWith("")
|
||||||
|
except:
|
||||||
|
self.log("ERROR: Removing strong tag")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
divTag = soup.find('div',attrs={'id':'articleBody'})
|
divTag = soup.find('div',attrs={'id':'articleBody'})
|
||||||
@ -704,20 +1144,20 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
divTag.replaceWith(tag)
|
divTag.replaceWith(tag)
|
||||||
except:
|
except:
|
||||||
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
|
||||||
|
#print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title))
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
if not first:
|
||||||
|
return
|
||||||
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
|
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
|
||||||
if idxdiv is not None:
|
if idxdiv is not None:
|
||||||
if idxdiv.img:
|
if idxdiv.img:
|
||||||
self.add_toc_thumbnail(article, idxdiv.img['src'])
|
self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
|
||||||
else:
|
else:
|
||||||
img = soup.find('img')
|
img = soup.find('body').find('img')
|
||||||
if img is not None:
|
if img is not None:
|
||||||
self.add_toc_thumbnail(article, img['src'])
|
self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))
|
||||||
|
|
||||||
shortparagraph = ""
|
shortparagraph = ""
|
||||||
try:
|
try:
|
||||||
if len(article.text_summary.strip()) == 0:
|
if len(article.text_summary.strip()) == 0:
|
||||||
@ -731,13 +1171,22 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
#account for blank paragraphs and short paragraphs by appending them to longer ones
|
||||||
if len(refparagraph) > 0:
|
if len(refparagraph) > 0:
|
||||||
if len(refparagraph) > 70: #approximately one line of text
|
if len(refparagraph) > 70: #approximately one line of text
|
||||||
article.summary = article.text_summary = shortparagraph + refparagraph
|
newpara = shortparagraph + refparagraph
|
||||||
|
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
|
||||||
|
if newparaEm == '':
|
||||||
|
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
|
||||||
|
if newparaEm == '':
|
||||||
|
newparaDesc = newparaDateline
|
||||||
|
article.summary = article.text_summary = newparaDesc.strip()
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
shortparagraph = refparagraph + " "
|
shortparagraph = refparagraph + " "
|
||||||
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
|
||||||
shortparagraph = shortparagraph + "- "
|
shortparagraph = shortparagraph + "- "
|
||||||
|
else:
|
||||||
|
article.summary = article.text_summary = self.massageNCXText(article.text_summary)
|
||||||
except:
|
except:
|
||||||
self.log("Error creating article descriptions")
|
self.log("Error creating article descriptions")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,31 +6,51 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re, string, time
-from calibre import entity_to_unicode, strftime
+from calibre import strftime
 from datetime import timedelta, date
+from time import sleep
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


 class NYTimes(BasicNewsRecipe):

+recursions=1 # set this to zero to omit Related articles lists
+match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed

+# set getTechBlogs to True to include the technology blogs
+# set tech_oldest_article to control article age
+# set tech_max_articles_per_feed to control article count
+getTechBlogs = True
+remove_empty_feeds = True
+tech_oldest_article = 14
+tech_max_articles_per_feed = 25

+# set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
+# otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
+getPopularArticles = True
+popularPeriod = '1' # set this to the number of days to include in the measurement
+# e.g. 7 will get the most popular measured over the last 7 days
+# and 30 will get the most popular measured over 30 days.
+# you still only get up to 20 articles in each category


 # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
 headlinesOnly = False

-# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
+# set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the
-# number of days old an article can be for inclusion. If oldest_article = 0 all articles
+# number of days old an article can be for inclusion. If oldest_web_article = None all articles
-# will be included. Note: oldest_article is ignored if webEdition = False
+# will be included. Note: oldest_web_article is ignored if webEdition = False
 webEdition = False
-oldest_article = 7
+oldest_web_article = None

-# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
-# previous paid versions of the new york times to best sent to the back issues folder on the kindle
-replaceKindleVersion = False

 # download higher resolution images than the small thumbnails typically included in the article
 # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
 useHighResImages = True

+# replace paid Kindle Version: the name will be changed to "The New York Times" to cause
+# previous paid versions of the new york times to best sent to the back issues folder on the kindle
+replaceKindleVersion = False

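The comments added above describe switches that are edited directly in a user's copy of the recipe. A sketch of one possible combination, with purely illustrative values (only the attribute names come from the code above; subclassing BasicNewsRecipe here is just to show the knobs together, it does not carry the NYTimes-specific logic):

# Hypothetical: the new switches gathered in one place with illustrative values.
from calibre.web.feeds.recipes import BasicNewsRecipe

class CustomNYTimes(BasicNewsRecipe):
    getTechBlogs = True              # include the technology blog feeds
    tech_oldest_article = 7          # days back for the tech blog feeds
    tech_max_articles_per_feed = 10
    getPopularArticles = True        # add Most E-Mailed / Most Viewed sections
    popularPeriod = '7'              # popularity window, in days (up to 20 articles each)
    webEdition = True                # build from the web edition index pages
    oldest_web_article = 2           # skip web articles older than two days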
# includeSections: List of sections to include. If empty, all sections found will be included.
|
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||||
# Otherwise, only the sections named will be included. For example,
|
# Otherwise, only the sections named will be included. For example,
|
||||||
#
|
#
|
||||||
@ -90,60 +110,68 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
('Education',u'education'),
|
('Education',u'education'),
|
||||||
('Multimedia',u'multimedia'),
|
('Multimedia',u'multimedia'),
|
||||||
(u'Obituaries',u'obituaries'),
|
(u'Obituaries',u'obituaries'),
|
||||||
(u'Sunday Magazine',u'magazine'),
|
(u'Sunday Magazine',u'magazine')
|
||||||
(u'Week in Review',u'weekinreview')]
|
]
|
||||||
|
|
||||||
|
tech_feeds = [
|
||||||
|
(u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'),
|
||||||
|
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
|
||||||
|
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
|
||||||
|
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
if headlinesOnly:
|
if headlinesOnly:
|
||||||
title='New York Times Headlines'
|
title='New York Times Headlines'
|
||||||
description = 'Headlines from the New York Times'
|
description = 'Headlines from the New York Times'
|
||||||
needs_subscription = True
|
needs_subscription = False
|
||||||
elif webEdition:
|
elif webEdition:
|
||||||
title='New York Times (Web)'
|
title='New York Times (Web)'
|
||||||
description = 'New York Times on the Web'
|
description = 'New York Times on the Web'
|
||||||
needs_subscription = True
|
needs_subscription = False
|
||||||
elif replaceKindleVersion:
|
elif replaceKindleVersion:
|
||||||
title='The New York Times'
|
title='The New York Times'
|
||||||
description = 'Today\'s New York Times'
|
description = 'Today\'s New York Times'
|
||||||
needs_subscription = True
|
needs_subscription = False
|
||||||
else:
|
else:
|
||||||
title='New York Times'
|
title='New York Times'
|
||||||
description = 'Today\'s New York Times. Needs subscription from http://www.nytimes.com'
|
description = 'Today\'s New York Times'
|
||||||
needs_subscription = True
|
needs_subscription = False
|
||||||
|
|
||||||
|
def decode_url_date(self,url):
|
||||||
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
|
urlitems = url.split('/')
|
||||||
|
|
||||||
def decode_us_date(self,datestr):
|
|
||||||
udate = datestr.strip().lower().split()
|
|
||||||
try:
|
try:
|
||||||
m = self.month_list.index(udate[0])+1
|
d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5]))
|
||||||
except:
|
except:
|
||||||
return date.today()
|
|
||||||
d = int(udate[1])
|
|
||||||
y = int(udate[2])
|
|
||||||
try:
|
try:
|
||||||
d = date(y,m,d)
|
d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6]))
|
||||||
except:
|
except:
|
||||||
d = date.today
|
return None
|
||||||
return d
|
return d
|
||||||
|
|
||||||
earliest_date = date.today() - timedelta(days=oldest_article)
|
if oldest_web_article is None:
|
||||||
|
earliest_date = date.today()
|
||||||
|
else:
|
||||||
|
earliest_date = date.today() - timedelta(days=oldest_web_article)
|
||||||
|
oldest_article = 365 # by default, a long time ago
|
||||||
|
|
||||||
__author__ = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier'
|
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
requires_version = (0, 7, 5)
|
requires_version = (0, 7, 5)
|
||||||
|
encoding = 'utf-8'
|
||||||
|
|
||||||
timefmt = ''
|
timefmt = ''
|
||||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
|
||||||
|
#simultaneous_downloads = 1 # no longer required to deal with ads
|
||||||
|
|
||||||
cover_margins = (18,18,'grey99')
|
cover_margins = (18,18,'grey99')
|
||||||
|
|
||||||
remove_tags_before = dict(id='article')
|
remove_tags_before = dict(id='article')
|
||||||
remove_tags_after = dict(id='article')
|
remove_tags_after = dict(id='article')
|
||||||
remove_tags = [dict(attrs={'class':[
|
remove_tags = [
|
||||||
|
dict(attrs={'class':[
|
||||||
'articleFooter',
|
'articleFooter',
|
||||||
'articleTools',
|
'articleTools',
|
||||||
'columnGroup doubleRule',
|
|
||||||
'columnGroup singleRule',
|
'columnGroup singleRule',
|
||||||
'columnGroup last',
|
'columnGroup last',
|
||||||
'columnGroup last',
|
'columnGroup last',
|
||||||
@ -151,7 +179,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'dottedLine',
|
'dottedLine',
|
||||||
'entry-meta',
|
'entry-meta',
|
||||||
'entry-response module',
|
'entry-response module',
|
||||||
#'icon enlargeThis', #removed to provide option for high res images
|
|
||||||
'leftNavTabs',
|
'leftNavTabs',
|
||||||
'metaFootnote',
|
'metaFootnote',
|
||||||
'module box nav',
|
'module box nav',
|
||||||
@ -175,12 +202,9 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'column four',#added for other blog downloads
|
'column four',#added for other blog downloads
|
||||||
'column four last',#added for other blog downloads
|
'column four last',#added for other blog downloads
|
||||||
'column last', #added for other blog downloads
|
'column last', #added for other blog downloads
|
||||||
'timestamp published', #added for other blog downloads
|
|
||||||
'entry entry-related',
|
'entry entry-related',
|
||||||
'subNavigation tabContent active', #caucus blog navigation
|
'subNavigation tabContent active', #caucus blog navigation
|
||||||
'columnGroup doubleRule',
|
|
||||||
'mediaOverlay slideshow',
|
'mediaOverlay slideshow',
|
||||||
'headlinesOnly multiline flush',
|
|
||||||
'wideThumb',
|
'wideThumb',
|
||||||
'video', #added 02-11-2011
|
'video', #added 02-11-2011
|
||||||
'videoHeader',#added 02-11-2011
|
'videoHeader',#added 02-11-2011
|
||||||
@ -189,7 +213,19 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
re.compile('^subNavigation'),
|
re.compile('^subNavigation'),
|
||||||
re.compile('^leaderboard'),
|
re.compile('^leaderboard'),
|
||||||
re.compile('^module'),
|
re.compile('^module'),
|
||||||
|
re.compile('commentCount'),
|
||||||
|
'credit'
|
||||||
]}),
|
]}),
|
||||||
|
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
|
||||||
|
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
|
||||||
|
dict(name='div', attrs={'class':'tweet'}),
|
||||||
|
dict(name='span', attrs={'class':'commentCount meta'}),
|
||||||
|
dict(name='div', attrs={'id':'header'}),
|
||||||
|
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
|
||||||
|
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
|
||||||
|
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
|
||||||
|
dict(name='div', attrs={'id':re.compile('respond')}), # open
|
||||||
|
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
|
||||||
dict(id=[
|
dict(id=[
|
||||||
'adxLeaderboard',
|
'adxLeaderboard',
|
||||||
'adxSponLink',
|
'adxSponLink',
|
||||||
@ -227,17 +263,21 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
|
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
|
||||||
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
.timestamp { text-align: left; font-size: small; }
|
.timestamp { font-weight: normal; text-align: left; font-size: 50%; }
|
||||||
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
.caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
|
||||||
a:link {text-decoration: none; }
|
a:link {text-decoration: none; }
|
||||||
|
.date{font-size: 50%; }
|
||||||
|
.update{font-size: 50%; }
|
||||||
.articleBody { }
|
.articleBody { }
|
||||||
.authorId {text-align: left; }
|
.authorId {text-align: left; font-size: 50%; }
|
||||||
.image {text-align: center;}
|
.image {text-align: center;}
|
||||||
.source {text-align: left; }'''
|
.aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
|
||||||
|
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
|
||||||
|
.source {text-align: left; font-size: x-small; }'''
|
||||||
|
|
||||||
|
|
||||||
articles = {}
|
articles = {}
|
||||||
@ -261,11 +301,11 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
del ans[idx]
|
del ans[idx]
|
||||||
idx_max = idx_max-1
|
idx_max = idx_max-1
|
||||||
continue
|
continue
|
||||||
if self.verbose:
|
if True: #self.verbose
|
||||||
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
|
||||||
for article in ans[idx][1]:
|
for article in ans[idx][1]:
|
||||||
total_article_count += 1
|
total_article_count += 1
|
||||||
if self.verbose:
|
if True: #self.verbose
|
||||||
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
|
||||||
article['url'].encode('cp1252','replace')))
|
article['url'].encode('cp1252','replace')))
|
||||||
idx = idx+1
|
idx = idx+1
|
||||||
@ -276,7 +316,7 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
def exclude_url(self,url):
|
def exclude_url(self,url):
|
||||||
if not url.startswith("http"):
|
if not url.startswith("http"):
|
||||||
return True
|
return True
|
||||||
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook
|
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
|
||||||
return True
|
return True
|
||||||
if 'nytimes.com' not in url:
|
if 'nytimes.com' not in url:
|
||||||
return True
|
return True
|
||||||
@ -319,88 +359,78 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
if self.username is not None and self.password is not None:
|
|
||||||
br.open('http://www.nytimes.com/auth/login')
|
|
||||||
br.form = br.forms().next()
|
|
||||||
br['userid'] = self.username
|
|
||||||
br['password'] = self.password
|
|
||||||
raw = br.submit().read()
|
|
||||||
if 'Please try again' in raw:
|
|
||||||
raise Exception('Your username and password are incorrect')
|
|
||||||
return br
|
return br
|
||||||
|
|
||||||
def skip_ad_pages(self, soup):
|
cover_tag = 'NY_NYT'
|
||||||
# Skip ad pages served before actual article
|
|
||||||
skip_tag = soup.find(True, {'name':'skip'})
|
|
||||||
if skip_tag is not None:
|
|
||||||
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
|
|
||||||
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
|
|
||||||
url += '?pagewanted=all'
|
|
||||||
self.log.warn("Skipping ad to article at '%s'" % url)
|
|
||||||
return self.index_to_soup(url, raw=True)
|
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover = None
|
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
|
||||||
st = time.localtime()
|
br = BasicNewsRecipe.get_browser()
|
||||||
year = str(st.tm_year)
|
daysback=1
|
||||||
month = "%.2d" % st.tm_mon
|
try:
|
||||||
day = "%.2d" % st.tm_mday
|
br.open(cover)
|
||||||
cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg'
|
except:
|
||||||
|
while daysback<7:
|
||||||
|
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
|
||||||
br = BasicNewsRecipe.get_browser()
|
br = BasicNewsRecipe.get_browser()
|
||||||
try:
|
try:
|
||||||
br.open(cover)
|
br.open(cover)
|
||||||
except:
|
except:
|
||||||
|
daysback = daysback+1
|
||||||
|
continue
|
||||||
|
break
|
||||||
|
if daysback==7:
|
||||||
self.log("\nCover unavailable")
|
self.log("\nCover unavailable")
|
||||||
cover = None
|
cover = None
|
||||||
return cover
|
return cover
|
||||||
|
|
||||||
|
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
||||||
|
|
||||||
|
|
||||||
def short_title(self):
|
def short_title(self):
|
||||||
return self.title
|
return self.title
|
||||||
|
|
||||||
def index_to_soup(self, url_or_raw, raw=False):
|
|
||||||
'''
|
def article_to_soup(self, url_or_raw, raw=False):
|
||||||
OVERRIDE of class method
|
from contextlib import closing
|
||||||
deals with various page encodings between index and articles
|
import copy
|
||||||
'''
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
def get_the_soup(docEncoding, url_or_raw, raw=False) :
|
print("ARTICLE_TO_SOUP "+url_or_raw)
|
||||||
if re.match(r'\w+://', url_or_raw):
|
if re.match(r'\w+://', url_or_raw):
|
||||||
br = self.clone_browser(self.browser)
|
br = self.clone_browser(self.browser)
|
||||||
f = br.open_novisit(url_or_raw)
|
open_func = getattr(br, 'open_novisit', br.open)
|
||||||
|
with closing(open_func(url_or_raw)) as f:
|
||||||
_raw = f.read()
|
_raw = f.read()
|
||||||
f.close()
|
|
||||||
if not _raw:
|
if not _raw:
|
||||||
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
|
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
|
||||||
else:
|
else:
|
||||||
_raw = url_or_raw
|
_raw = url_or_raw
|
||||||
if raw:
|
if raw:
|
||||||
return _raw
|
return _raw
|
||||||
|
|
||||||
if not isinstance(_raw, unicode) and self.encoding:
|
if not isinstance(_raw, unicode) and self.encoding:
|
||||||
_raw = _raw.decode(docEncoding, 'replace')
|
if callable(self.encoding):
|
||||||
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
_raw = self.encoding(_raw)
|
||||||
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
|
else:
|
||||||
return BeautifulSoup(_raw, markupMassage=massage)
|
_raw = _raw.decode(self.encoding, 'replace')
|
||||||
|
|
||||||
# Entry point
|
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
||||||
soup = get_the_soup( self.encoding, url_or_raw )
|
nmassage.extend(self.preprocess_regexps)
|
||||||
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
|
||||||
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
|
# Some websites have buggy doctype declarations that mess up beautifulsoup
|
||||||
if docEncoding == '' :
|
# Remove comments as they can leave detritus when extracting tags leaves
|
||||||
docEncoding = self.encoding
|
# multiple nested comments
|
||||||
|
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
||||||
|
usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0]
|
||||||
|
usrc = self.preprocess_raw_html(usrc, url_or_raw)
|
||||||
|
return BeautifulSoup(usrc, markupMassage=nmassage)
|
||||||
|
|
||||||
if self.verbose > 2:
|
|
||||||
self.log( " document encoding: '%s'" % docEncoding)
|
|
||||||
if docEncoding != self.encoding :
|
|
||||||
soup = get_the_soup(docEncoding, url_or_raw)
|
|
||||||
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def massageNCXText(self, description):
|
def massageNCXText(self, description):
|
||||||
# Kindle TOC descriptions won't render certain characters
|
# Kindle TOC descriptions won't render certain characters
|
||||||
if description:
|
if description:
|
||||||
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
|
||||||
# Replace '&' with '&'
|
# Replace '&' with '&'
|
||||||
massaged = re.sub("&","&", massaged)
|
massaged = re.sub("&","&", massaged)
|
||||||
|
massaged = re.sub("&","&", massaged)
|
||||||
return self.fixChars(massaged)
|
return self.fixChars(massaged)
|
||||||
else:
|
else:
|
||||||
return description
|
return description
|
||||||
@ -422,6 +452,16 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if self.filterDuplicates:
|
if self.filterDuplicates:
|
||||||
if url in self.url_list:
|
if url in self.url_list:
|
||||||
return
|
return
+        if self.webEdition:
+            date_tag = self.decode_url_date(url)
+            if date_tag is not None:
+                if self.oldest_web_article is not None:
+                    if date_tag < self.earliest_date:
+                        self.log("Skipping article %s" % url)
+                        return
+            else:
+                self.log("Skipping article %s" % url)
+                return
         self.url_list.append(url)
         title = self.tag_to_string(a, use_alt=True).strip()
         description = ''
@@ -446,6 +486,92 @@ class NYTimes(BasicNewsRecipe):
                          description=description, author=author,
                          content=''))

+    def get_popular_articles(self,ans):
+        if self.getPopularArticles:
+            popular_articles = {}
+            key_list = []
+
+            def handleh3(h3tag):
+                try:
+                    url = h3tag.a['href']
+                except:
+                    return ('','','','')
+                url = re.sub(r'\?.*', '', url)
+                if self.exclude_url(url):
+                    return ('','','','')
+                url += '?pagewanted=all'
+                title = self.tag_to_string(h3tag.a,False)
+                h6tag = h3tag.findNextSibling('h6')
+                if h6tag is not None:
+                    author = self.tag_to_string(h6tag,False)
+                else:
+                    author = ''
+                ptag = h3tag.findNextSibling('p')
+                if ptag is not None:
+                    desc = self.tag_to_string(ptag,False)
+                else:
+                    desc = ''
+                return(title,url,author,desc)
+
+            have_emailed = False
+            emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
+            for h3tag in emailed_soup.findAll('h3'):
+                (title,url,author,desc) = handleh3(h3tag)
+                if url=='':
+                    continue
+                if not have_emailed:
+                    key_list.append('Most E-Mailed')
+                    popular_articles['Most E-Mailed'] = []
+                    have_emailed = True
+                popular_articles['Most E-Mailed'].append(
+                    dict(title=title, url=url, date=strftime('%a, %d %b'),
+                         description=desc, author=author,
+                         content=''))
+            have_viewed = False
+            viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod)
+            for h3tag in viewed_soup.findAll('h3'):
+                (title,url,author,desc) = handleh3(h3tag)
+                if url=='':
+                    continue
+                if not have_viewed:
+                    key_list.append('Most Viewed')
+                    popular_articles['Most Viewed'] = []
+                    have_viewed = True
+                popular_articles['Most Viewed'].append(
+                    dict(title=title, url=url, date=strftime('%a, %d %b'),
+                         description=desc, author=author,
+                         content=''))
+            viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+            for x in viewed_ans:
+                ans.append(x)
+        return ans
+
+    def get_tech_feeds(self,ans):
+        if self.getTechBlogs:
+            tech_articles = {}
+            key_list = []
+            save_oldest_article = self.oldest_article
+            save_max_articles_per_feed = self.max_articles_per_feed
+            self.oldest_article = self.tech_oldest_article
+            self.max_articles_per_feed = self.tech_max_articles_per_feed
+            self.feeds = self.tech_feeds
+            tech = self.parse_feeds()
+            self.oldest_article = save_oldest_article
+            self.max_articles_per_feed = save_max_articles_per_feed
+            self.feeds = None
+            for f in tech:
+                key_list.append(f.title)
+                tech_articles[f.title] = []
+                for a in f.articles:
+                    tech_articles[f.title].append(
+                        dict(title=a.title, url=a.url, date=a.date,
+                             description=a.summary, author=a.author,
+                             content=a.content))
+            tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+            for x in tech_ans:
+                ans.append(x)
+        return ans
+
     def parse_web_edition(self):
@@ -457,31 +583,41 @@ class NYTimes(BasicNewsRecipe):
             if sec_title in self.excludeSections:
                 print "SECTION EXCLUDED: ",sec_title
                 continue
-            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
-            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            try:
+                soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            except:
+                continue
+            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
+
             self.key = sec_title
             # Find each article
             for div in soup.findAll(True,
-                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
-                if div['class'] in ['story', 'story headline'] :
+                attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+                if div['class'] in ['story', 'story headline', 'storyHeader'] :
                     self.handle_article(div)
+                elif div['class'] == 'ledeStory':
+                    divsub = div.find('div','storyHeader')
+                    if divsub is not None:
+                        self.handle_article(divsub)
+                    ulrefer = div.find('ul','refer')
+                    if ulrefer is not None:
+                        for lidiv in ulrefer.findAll('li'):
+                            self.handle_article(lidiv)
                 elif div['class'] == 'headlinesOnly multiline flush':
                     for lidiv in div.findAll('li'):
                         self.handle_article(lidiv)

         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.ans)
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_todays_index(self):

         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

         skipping = False
         # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

             if div['class'] in ['section-headline','sectionHeader']:
                 self.key = string.capwords(self.feed_title(div))
                 self.key = self.key.replace('Op-ed','Op-Ed')
@@ -505,7 +641,7 @@ class NYTimes(BasicNewsRecipe):
                         self.handle_article(lidiv)

         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.ans)
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_headline_index(self):

@@ -553,7 +689,7 @@ class NYTimes(BasicNewsRecipe):
             for h3_item in search_div.findAll('h3'):
                 byline = h3_item.h6
                 if byline is not None:
-                    author = self.tag_to_string(byline,usa_alt=False)
+                    author = self.tag_to_string(byline,use_alt=False)
                 else:
                     author = ''
                 a = h3_item.find('a', href=True)
@@ -579,7 +715,7 @@ class NYTimes(BasicNewsRecipe):
                 self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.ans)
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

     def parse_index(self):
         if self.headlinesOnly:
@@ -589,40 +725,199 @@ class NYTimes(BasicNewsRecipe):
         else:
             return self.parse_todays_index()

-    def strip_anchors(self,soup):
+    def strip_anchors(self,soup,kill_all=False):
         paras = soup.findAll(True)
         for para in paras:
             aTags = para.findAll('a')
             for a in aTags:
                 if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+                    if kill_all or (self.recursions==0):
+                        a.replaceWith(self.tag_to_string(a,False))
+                    else:
+                        if a.has_key('href'):
+                            if a['href'].startswith('http://www.nytimes'):
+                                if not a['href'].endswith('pagewanted=all'):
+                                    url = re.sub(r'\?.*', '', a['href'])
+                                    if self.exclude_url(url):
+                                        a.replaceWith(self.tag_to_string(a,False))
+                                    else:
+                                        a['href'] = url+'?pagewanted=all'
+                            elif not (a['href'].startswith('http://pogue') or \
+                                      a['href'].startswith('http://bits') or \
+                                      a['href'].startswith('http://travel') or \
+                                      a['href'].startswith('http://business') or \
+                                      a['href'].startswith('http://tech') or \
+                                      a['href'].startswith('http://health') or \
+                                      a['href'].startswith('http://dealbook') or \
+                                      a['href'].startswith('http://open')):
+                                a.replaceWith(self.tag_to_string(a,False))
+        return soup
+
+    def handle_tags(self,soup):
+        try:
+            print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title))
+        except:
+            print("HANDLE TAGS: NO TITLE")
+        if soup is None:
+            print("ERROR: handle_tags received NoneType")
+            return None
+
+        ## print("HANDLING AD FORWARD:")
+        ## print(soup)
+        if self.keep_only_tags:
+            body = Tag(soup, 'body')
+            try:
+                if isinstance(self.keep_only_tags, dict):
+                    self.keep_only_tags = [self.keep_only_tags]
+                for spec in self.keep_only_tags:
+                    for tag in soup.find('body').findAll(**spec):
+                        body.insert(len(body.contents), tag)
+                soup.find('body').replaceWith(body)
+            except AttributeError: # soup has no body element
+                pass
+
+        def remove_beyond(tag, next):
+            while tag is not None and getattr(tag, 'name', None) != 'body':
+                after = getattr(tag, next)
+                while after is not None:
+                    ns = getattr(tag, next)
+                    after.extract()
+                    after = ns
+                tag = tag.parent
+
+        if self.remove_tags_after is not None:
+            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
+            for spec in rt:
+                tag = soup.find(**spec)
+                remove_beyond(tag, 'nextSibling')
+
+        if self.remove_tags_before is not None:
+            tag = soup.find(**self.remove_tags_before)
+            remove_beyond(tag, 'previousSibling')
+
+        for kwds in self.remove_tags:
+            for tag in soup.findAll(**kwds):
+                tag.extract()
+
         return soup

     def preprocess_html(self, soup):
-        if self.webEdition & (self.oldest_article>0):
-            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
-            if date_tag:
-                date_str = self.tag_to_string(date_tag,use_alt=False)
-                date_str = date_str.replace('Published:','')
-                date_items = date_str.split(',')
-                try:
-                    datestring = date_items[0]+' '+date_items[1]
-                    article_date = self.decode_us_date(datestring)
-                except:
-                    article_date = date.today()
-                if article_date < self.earliest_date:
-                    self.log("Skipping article dated %s" % date_str)
-                    return None
-
-        #all articles are from today, no need to print the date on every page
-        try:
-            if not self.webEdition:
-                date_tag = soup.find(True,attrs={'class': ['dateline','date']})
-                if date_tag:
-                    date_tag.extract()
-        except:
-            self.log("Error removing the published date")
+        #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
+        skip_tag = soup.find(True, {'name':'skip'})
+        if skip_tag is not None:
+            #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + skip_tag.parent['href']
+            #url += '?pagewanted=all'
+            self.log.warn("Skipping ad to article at '%s'" % url)
+            sleep(5)
+            soup = self.handle_tags(self.article_to_soup(url))
+
+        # check if the article is from one of the tech blogs
+        blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']})
+
+        if blog is not None:
+            old_body = soup.find('body')
+            new_body=Tag(soup,'body')
+            new_body.append(soup.find('div',attrs={'id':'content'}))
+            new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
+            old_body.replaceWith(new_body)
+            for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
+                if divr.find(text=re.compile('Sign up')):
+                    divr.extract()
+            divr = soup.find('div',attrs={'id':re.compile('related-content')})
+            if divr is not None:
+                # handle related articles
+                rlist = []
+                ul = divr.find('ul')
+                if ul is not None:
+                    for li in ul.findAll('li'):
+                        atag = li.find('a')
+                        if atag is not None:
+                            if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \
+                               atag['href'].startswith('http://open'):
+                                atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False)))
+                                rlist.append(atag)
+                divr.extract()
+                if rlist != []:
+                    asidediv = Tag(soup,'div',[('class','aside')])
+                    if soup.find('hr') is None:
+                        asidediv.append(Tag(soup,'hr'))
+                    h4 = Tag(soup,'h4',[('class','asidenote')])
+                    h4.insert(0,"Related Posts")
+                    asidediv.append(h4)
+                    ul = Tag(soup,'ul')
+                    for r in rlist:
+                        li = Tag(soup,'li',[('class','aside')])
+                        r['class'] = 'aside'
+                        li.append(r)
+                        ul.append(li)
+                    asidediv.append(ul)
+                    asidediv.append(Tag(soup,'hr'))
+                    smain = soup.find('body')
+                    smain.append(asidediv)
+            for atag in soup.findAll('a'):
+                img = atag.find('img')
+                if img is not None:
+                    atag.replaceWith(img)
+                elif not atag.has_key('href'):
+                    atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
+                elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
+                          atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
+                    atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
+            hdr = soup.find('address')
+            if hdr is not None:
+                hdr.name='span'
+            for span_credit in soup.findAll('span','credit'):
+                sp = Tag(soup,'span')
+                span_credit.replaceWith(sp)
+                sp.append(Tag(soup,'br'))
+                sp.append(span_credit)
+                sp.append(Tag(soup,'br'))
+
+        else: # nytimes article
+
+            related = []          # these will be the related articles
+            first_outer = None    # first related outer tag
+            first_related = None  # first related tag
+            for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
+                for rdiv in soup.findAll('div','columnGroup doubleRule'):
+                    if rdiv.find('h3') is not None:
+                        if self.tag_to_string(rdiv.h3,False).startswith('Related'):
+                            rdiv.h3.find(text=True).replaceWith("Related articles")
+                            rdiv.h3['class'] = 'asidenote'
+                            for litag in rdiv.findAll('li'):
+                                if litag.find('a') is not None:
+                                    if litag.find('a')['href'].startswith('http://www.nytimes.com'):
+                                        url = re.sub(r'\?.*', '', litag.find('a')['href'])
+                                        litag.find('a')['href'] = url+'?pagewanted=all'
+                                        litag.extract()
+                                        related.append(litag)
+                                        if first_related is None:
+                                            first_related = rdiv
+                                            first_outer = outerdiv
+                                    else:
+                                        litag.extract()
+            if related != []:
+                for r in related:
+                    if r.h6: # don't want the anchor inside a h6 tag
+                        r.h6.replaceWith(r.h6.a)
+                    first_related.ul.append(r)
+                first_related.insert(0,Tag(soup,'hr'))
+                first_related.append(Tag(soup,'hr'))
+                first_related['class'] = 'aside'
+                first_outer.replaceWith(first_related) # replace the outer tag with the related tag
+
+            for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
+                rdiv.extract()
+
+            kicker_tag = soup.find(attrs={'class':'kicker'})
+            if kicker_tag: # remove Op_Ed author head shots
+                tagline = self.tag_to_string(kicker_tag)
+                if tagline=='Op-Ed Columnist':
+                    img_div = soup.find('div','inlineImage module')
+                    if img_div:
+                        img_div.extract()

         if self.useHighResImages:
             try:
@@ -667,26 +962,6 @@ class NYTimes(BasicNewsRecipe):
             except Exception:
                 self.log("Error pulling high resolution images")

-            try:
-                #remove "Related content" bar
-                runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft ','articleInline runaroundLeft lastArticleInline']})
-                if runAroundsFound:
-                    for runAround in runAroundsFound:
-                        #find all section headers
-                        hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
-                        if hlines:
-                            for hline in hlines:
-                                hline.extract()
-
-                        #find all section headers
-                        hlines = runAround.findAll('h6')
-                        if hlines:
-                            for hline in hlines:
-                                hline.extract()
-            except:
-                self.log("Error removing related content bar")
-
             try:
                 #in case pulling images failed, delete the enlarge this text
                 enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
@@ -696,9 +971,25 @@ class NYTimes(BasicNewsRecipe):
             except:
                 self.log("Error removing Enlarge this text")

-        return self.strip_anchors(soup)
-
-    def postprocess_html(self,soup, True):
+        return self.strip_anchors(soup,False)
+
+    def postprocess_html(self,soup,first_fetch):
+        if not first_fetch: # remove Related links
+            for aside in soup.findAll('div','aside'):
+                aside.extract()
+            soup = self.strip_anchors(soup,True)
+            #print("RECURSIVE: "+self.tag_to_string(soup.title))
+
+        if soup.find('div',attrs={'id':'blogcontent'}) is None:
+            if first_fetch:
+                aside = soup.find('div','aside')
+                if aside is not None: # move the related list to the end of the article
+                    art = soup.find('div',attrs={'id':'article'})
+                    if art is None:
+                        art = soup.find('div',attrs={'class':'article'})
+                    if art is not None:
+                        art.append(aside)
         try:
             if self.one_picture_per_article:
                 # Remove all images after first
@@ -774,7 +1065,7 @@ class NYTimes(BasicNewsRecipe):
                 if headline:
                     tag = Tag(soup, "h2")
                     tag['class'] = "headline"
-                    tag.insert(0, self.fixChars(headline.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(headline,False)))
                     soup.insert(0, tag)
                     hrs = soup.findAll('hr')
                     for hr in hrs:
@@ -788,7 +1079,7 @@ class NYTimes(BasicNewsRecipe):
                 if bylineauthor:
                     tag = Tag(soup, "h6")
                     tag['class'] = "byline"
-                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False)))
                     bylineauthor.replaceWith(tag)
             except:
                 self.log("ERROR: fixing byline author format")
@@ -799,7 +1090,7 @@ class NYTimes(BasicNewsRecipe):
                 if blogcredit:
                     tag = Tag(soup, "h6")
                     tag['class'] = "credit"
-                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                    tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False)))
                     blogcredit.replaceWith(tag)
             except:
                 self.log("ERROR: fixing credit format")
@@ -853,25 +1144,24 @@ class NYTimes(BasicNewsRecipe):
                     divTag.replaceWith(tag)
         except:
             self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
+        #print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title))
         return soup

     def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
+        if not first:
+            return
         idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
         if idxdiv is not None:
             if idxdiv.img:
-                self.add_toc_thumbnail(article, idxdiv.img['src'])
+                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src']))
         else:
-            img = soup.find('img')
+            img = soup.find('body').find('img')
             if img is not None:
-                self.add_toc_thumbnail(article, img['src'])
+                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src']))

         shortparagraph = ""
         try:
             if len(article.text_summary.strip()) == 0:
                 articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
-                if not articlebodies: #added to account for blog formats
-                    articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats
                 if articlebodies:
                     for articlebody in articlebodies:
                         if articlebody:
@@ -880,15 +1170,23 @@ class NYTimes(BasicNewsRecipe):
                                 refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                 #account for blank paragraphs and short paragraphs by appending them to longer ones
                                 if len(refparagraph) > 0:
-                                    if len(refparagraph) > 140: #approximately two lines of text
-                                        article.summary = article.text_summary = shortparagraph + refparagraph
+                                    if len(refparagraph) > 70: #approximately one line of text
+                                        newpara = shortparagraph + refparagraph
+                                        newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
+                                        if newparaEm == '':
+                                            newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
+                                            if newparaEm == '':
+                                                newparaDesc = newparaDateline
+                                        article.summary = article.text_summary = newparaDesc.strip()
                                         return
                                     else:
                                         shortparagraph = refparagraph + " "
                                         if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                             shortparagraph = shortparagraph + "- "
+            else:
+                article.summary = article.text_summary = self.massageNCXText(article.text_summary)
         except:
             self.log("Error creating article descriptions")
             return
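Note on the NYTimes recipe changes above: the new get_popular_articles() and get_tech_feeds() helpers, together with the filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) calls, append extra sections to the index that each parse_* method builds. As an illustrative sketch only (the section name and article values below are invented examples, not part of the diff), the structure being extended is a list of (section_title, article_list) tuples:

    # hypothetical example of the index structure the new helpers append to
    ans = [
        ('World', [
            dict(title='Example headline',
                 url='http://www.nytimes.com/example?pagewanted=all',
                 date='Sat, 12 Jan', description='', author='', content=''),
        ]),
    ]
    # get_popular_articles(ans) adds 'Most E-Mailed' / 'Most Viewed' sections,
    # get_tech_feeds(ans) adds one section per tech-blog feed, and
    # filter_ans() then applies the recipe's include/exclude section filters.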
recipes/outside_magazine.recipe (new file, 65 lines)
@@ -0,0 +1,65 @@
from calibre.web.feeds.recipes import BasicNewsRecipe

class NYTimes(BasicNewsRecipe):

    title = 'Outside Magazine'
    __author__ = 'Krittika Goyal'
    description = 'Outside Magazine - Free 1 Month Old Issue'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    language = 'en'

    no_stylesheets = True
    #auto_cleanup = True
    #auto_cleanup_keep = '//div[@class="thumbnail"]'

    keep_only_tags = dict(name='div', attrs={'class':'masonry-box width-four'})
    remove_tags = [
        dict(name='div', attrs={'id':['share-bar', 'outbrain_widget_0', 'outbrain_widget_1', 'livefyre']}),
        #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
        #dict(name='form', attrs={'onsubmit':''}),
        dict(name='section', attrs={'id':['article-quote', 'article-navigation']}),
    ]

    #TO GET ARTICLE TOC
    def out_get_index(self):
        super_url = 'http://www.outsideonline.com/magazine/'
        super_soup = self.index_to_soup(super_url)
        div = super_soup.find(attrs={'class':'masonry-box width-four'})
        issue = div.findAll(name='article')[1]
        super_a = issue.find('a', href=True)
        return super_a.get('href')

    # To parse article toc
    def parse_index(self):
        parse_soup = self.index_to_soup(self.out_get_index())

        feeds = []
        feed_title = 'Articles'

        articles = []
        self.log('Found section:', feed_title)
        div = parse_soup.find(attrs={'class':'print clearfix'})
        for art in div.findAll(name='p'):
            art_info = art.find(name = 'a')
            if art_info is None:
                continue
            art_title = self.tag_to_string(art_info)
            url = art_info.get('href') + '?page=all'
            self.log.info('\tFound article:', art_title, 'at', url)
            article = {'title':art_title, 'url':url, 'date':''}
            #au = art.find(attrs={'class':'articleAuthors'})
            #if au is not None:
            #    article['author'] = self.tag_to_string(au)
            #desc = art.find(attrs={'class':'hover_text'})
            #if desc is not None:
            #    desc = self.tag_to_string(desc)
            #    if 'author' in article:
            #        desc = ' by ' + article['author'] + ' ' +desc
            #    article['description'] = desc
            articles.append(article)
        if articles:
            feeds.append((feed_title, articles))

        return feeds
recipes/oxford_mail.recipe (new file, 22 lines)
@@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class HindustanTimes(BasicNewsRecipe):
    title = u'Oxford Mail'
    language = 'en_GB'
    __author__ = 'Krittika Goyal'
    oldest_article = 1 #days
    max_articles_per_feed = 25
    #encoding = 'cp1252'
    use_embedded_content = False

    no_stylesheets = True
    auto_cleanup = True


    feeds = [
        ('News',
         'http://www.oxfordmail.co.uk/news/rss/'),
        ('Sports',
         'http://www.oxfordmail.co.uk/sport/rss/'),
    ]
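Several of the recipes added in this merge (Oxford Mail, Titanic, Zaxid.net and the Ukrainian-language recipes) share the same minimal pattern: declare RSS feeds and let calibre's auto_cleanup heuristics extract the article body. For reference, a stripped-down sketch of that pattern; the class name, title and feed URL below are placeholders, not part of the merge:

    from calibre.web.feeds.news import BasicNewsRecipe

    class MinimalFeedRecipe(BasicNewsRecipe):
        title = u'Example Feed'           # placeholder recipe, for illustration only
        language = 'en'
        oldest_article = 7                # how many days back to fetch
        max_articles_per_feed = 25
        no_stylesheets = True
        use_embedded_content = False
        auto_cleanup = True               # heuristic article extraction, no per-site tags needed
        feeds = [('News', 'http://example.com/rss/')]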
@@ -1,27 +1,27 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup

 class PajamasMedia(BasicNewsRecipe):
     title = u'Pajamas Media'
     description = u'Provides exclusive news and opinion for forty countries.'
     language = 'en'
     __author__ = 'Krittika Goyal'
-    oldest_article = 1 #days
+    oldest_article = 2 #days
     max_articles_per_feed = 25
     recursions = 1
     match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']
     #encoding = 'latin1'

     remove_stylesheets = True
-    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    remove_tags_after = dict(name='div', attrs={'class':'paged-nav'})
-    remove_tags = [
-        dict(name='iframe'),
-        dict(name='div', attrs={'class':['pages']}),
-        #dict(name='div', attrs={'id':['bookmark']}),
-        #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
-        #dict(name='ul', attrs={'class':'articleTools'}),
-    ]
+    auto_cleanup = True
+    ##remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    #remove_tags_after = dict(name='div', attrs={'class':'paged-nav'})
+    #remove_tags = [
+    #    dict(name='iframe'),
+    #    dict(name='div', attrs={'class':['pages']}),
+    #    ##dict(name='div', attrs={'id':['bookmark']}),
+    #    ##dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
+    #    ##dict(name='ul', attrs={'class':'articleTools'}),
+    #]

     feeds = [
     ('pajamas Media',
@@ -29,20 +29,20 @@ class PajamasMedia(BasicNewsRecipe):

     ]

-    def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'id':'innerpage-content'})
-        #td = heading.findParent(name='td')
-        #td.extract()
-
-        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
-        body = soup.find(name='body')
-        body.insert(0, story)
-        return soup
-
-    def postprocess_html(self, soup, first):
-        if not first:
-            h = soup.find(attrs={'class':'innerpage-header'})
-            if h: h.extract()
-            auth = soup.find(attrs={'class':'author'})
-            if auth: auth.extract()
-        return soup
+    #def preprocess_html(self, soup):
+        #story = soup.find(name='div', attrs={'id':'innerpage-content'})
+        ##td = heading.findParent(name='td')
+        ##td.extract()
+
+        #soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        #body = soup.find(name='body')
+        #body.insert(0, story)
+        #return soup
+
+    #def postprocess_html(self, soup, first):
+        #if not first:
+            #h = soup.find(attrs={'class':'innerpage-header'})
+            #if h: h.extract()
+            #auth = soup.find(attrs={'class':'author'})
+            #if auth: auth.extract()
+        #return soup
@@ -6,7 +6,6 @@ class PhilosophyNow(BasicNewsRecipe):

     title = 'Philosophy Now'
     __author__ = 'Rick Shang'
-
     description = '''Philosophy Now is a lively magazine for everyone
     interested in ideas. It isn't afraid to tackle all the major questions of
     life, the universe and everything. Published every two months, it tries to
@@ -27,7 +26,7 @@ class PhilosophyNow(BasicNewsRecipe):
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         br.open('https://philosophynow.org/auth/login')
-        br.select_form(nr = 1)
+        br.select_form(name="loginForm")
         br['username'] = self.username
         br['password'] = self.password
         br.submit()
@@ -50,19 +49,20 @@ class PhilosophyNow(BasicNewsRecipe):
         #Go to the main body
         current_issue_url = 'http://philosophynow.org/issues/' + issuenum
         soup = self.index_to_soup(current_issue_url)
-        div = soup.find ('div', attrs={'class':'articlesColumn'})
+        div = soup.find ('div', attrs={'class':'contentsColumn'})

         feeds = OrderedDict()

-        for post in div.findAll('h3'):
+        for post in div.findAll('h1'):
             articles = []
             a=post.find('a',href=True)
             if a is not None:
                 url="http://philosophynow.org" + a['href']
                 title=self.tag_to_string(a).strip()
-                s=post.findPrevious('h4')
+                s=post.findPrevious('h3')
                 section_title = self.tag_to_string(s).strip()
-                d=post.findNext('p')
+                d=post.findNext('h2')
                 desc = self.tag_to_string(d).strip()
                 articles.append({'title':title, 'url':url, 'description':desc, 'date':''})

@@ -73,3 +73,5 @@ class PhilosophyNow(BasicNewsRecipe):
         ans = [(key, val) for key, val in feeds.iteritems()]
         return ans
+
+    def cleanup(self):
+        self.browser.open('http://philosophynow.org/auth/logout')
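The Philosophy Now changes above switch the login form lookup from a positional index to the form name and add a cleanup() hook that logs out of the site once fetching is done. As a reference only, a minimal sketch of that subscription pattern as this recipe now uses it (the URLs and the form field names are taken from the diff; everything else is illustrative):

    class SubscriptionRecipeSketch(BasicNewsRecipe):
        needs_subscription = True

        def get_browser(self):
            br = BasicNewsRecipe.get_browser()
            br.open('https://philosophynow.org/auth/login')
            br.select_form(name='loginForm')   # select the form by name, not by index
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
            return br

        def cleanup(self):
            # runs after the articles have been downloaded
            self.browser.open('http://philosophynow.org/auth/logout')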
63
recipes/poradnia_pwn.recipe
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
class PoradniaPWN(BasicNewsRecipe):
|
||||||
|
title = u'Poradnia Językowa PWN'
|
||||||
|
__author__ = 'fenuks'
|
||||||
|
description = u'Internetowa poradnia językowa Wydawnictwa Naukowego PWN. Poradnię prowadzi Redaktor Naczelny Słowników Języka Polskiego, prof. Mirosław Bańko. Pomagają mu eksperci - znani polscy językoznawcy. Współpracuje z nami m.in. prof. Jerzy Bralczyk oraz dr Jan Grzenia.'
|
||||||
|
category = 'language'
|
||||||
|
language = 'pl'
|
||||||
|
#cover_url = ''
|
||||||
|
oldest_article = 14
|
||||||
|
max_articles_per_feed = 100000
|
||||||
|
INDEX = "http://poradnia.pwn.pl/"
|
||||||
|
no_stylesheets = True
|
||||||
|
remove_attributes = ['style']
|
||||||
|
remove_javascript = True
|
||||||
|
use_embedded_content = False
|
||||||
|
#preprocess_regexps = [(re.compile('<li|ul', re.IGNORECASE), lambda m: '<div'),(re.compile('</li>', re.IGNORECASE), lambda m: '</div>'), (re.compile('</ul>', re.IGNORECASE), lambda m: '</div>')]
|
||||||
|
keep_only_tags = [dict(name="div", attrs={"class":"searchhi"})]
|
||||||
|
feeds = [(u'Poradnia', u'http://rss.pwn.pl/poradnia.rss')]
|
||||||
|
|
||||||
|
'''def find_articles(self, url):
|
||||||
|
articles = []
|
||||||
|
soup=self.index_to_soup(url)
|
||||||
|
counter = int(soup.find(name='p', attrs={'class':'count'}).findAll('b')[-1].string)
|
||||||
|
counter = 500
|
||||||
|
pos = 0
|
||||||
|
next = url
|
||||||
|
while next:
|
||||||
|
soup=self.index_to_soup(next)
|
||||||
|
tag=soup.find(id="listapytan")
|
||||||
|
art=tag.findAll(name='li')
|
||||||
|
for i in art:
|
||||||
|
if i.h4:
|
||||||
|
title=i.h4.a.string
|
||||||
|
url=self.INDEX+i.h4.a['href']
|
||||||
|
#date=soup.find(id='footer').ul.li.string[41:-1]
|
||||||
|
articles.append({'title' : title,
|
||||||
|
'url' : url,
|
||||||
|
'date' : '',
|
||||||
|
'description' : ''
|
||||||
|
})
|
||||||
|
pos += 10
|
||||||
|
if not pos >=counter:
|
||||||
|
next = 'http://poradnia.pwn.pl/lista.php?kat=18&od=' + str(pos)
|
||||||
|
print u'Tworzenie listy artykułów dla', next
|
||||||
|
else:
|
||||||
|
next = None
|
||||||
|
print articles
|
||||||
|
return articles
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
feeds = []
|
||||||
|
feeds.append((u"Poradnia", self.find_articles('http://poradnia.pwn.pl/lista.php')))
|
||||||
|
|
||||||
|
return feeds'''
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for i in soup.findAll(name=['ul', 'li']):
|
||||||
|
i.name="div"
|
||||||
|
for z in soup.findAll(name='a'):
|
||||||
|
if not z['href'].startswith('http'):
|
||||||
|
z['href'] = 'http://poradnia.pwn.pl/' + z['href']
|
||||||
|
return soup
|
recipes/schattenblick.recipe (new file, 13 lines)
@@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1345802300(BasicNewsRecipe):
    title = u'Online-Zeitung Schattenblick'
    language = 'de'
    __author__ = 'ThB'
    publisher = u'MA-Verlag'
    category = u'Nachrichten'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://www.schattenblick.de/mobi/rss/cover.jpg'
    feeds = [(u'Schattenblick Tagesausgabe', u'http://www.schattenblick.de/mobi/rss/rss.xml')]
@@ -1,12 +1,13 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+# -*- coding: utf-8 -*-

-class BasicUserRecipe1324913680(BasicNewsRecipe):
+from calibre.web.feeds.news import BasicNewsRecipe
+class AdvancedUserRecipe1355341662(BasicNewsRecipe):
     title = u'Sivil Dusunce'
     language = 'tr'
     __author__ = 'asalet_r'

     oldest_article = 7
-    max_articles_per_feed = 20
+    max_articles_per_feed = 50
     auto_cleanup = True

-    feeds = [(u'Sivil Dusunce', u'http://www.sivildusunce.com/feed/')]
+    feeds = [(u'Sivil Dusunce', u'http://www.sivildusunce.com/?t=rss&xml=1')]
recipes/spectator_magazine.recipe (new file, 60 lines)
@@ -0,0 +1,60 @@
from calibre.web.feeds.recipes import BasicNewsRecipe

class NYTimes(BasicNewsRecipe):

    title = 'Spectator Magazine'
    __author__ = 'Krittika Goyal'
    description = 'Magazine'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    language = 'en'

    no_stylesheets = True
    #auto_cleanup = True
    #auto_cleanup_keep = '//div[@class="thumbnail"]'

    keep_only_tags = dict(name='div', attrs={'id':'content'})
    remove_tags = [
        dict(name='div', attrs={'id':['disqus_thread']}),
        ##dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
        ##dict(name='form', attrs={'onsubmit':''}),
        #dict(name='section', attrs={'id':['article-quote', 'article-navigation']}),
    ]

    #TO GET ARTICLE TOC
    def spec_get_index(self):
        return self.index_to_soup('http://www.spectator.co.uk/')

    # To parse article toc
    def parse_index(self):
        parse_soup = self.index_to_soup('http://www.spectator.co.uk/')

        feeds = []
        feed_title = 'Spectator Magazine Articles'

        articles = []
        self.log('Found section:', feed_title)
        div = parse_soup.find(attrs={'class':'one-col-tax-widget magazine-list columns-1 post-8 taxonomy-category full-width widget section-widget icit-taxonomical-listings'})
        for art in div.findAll(name='h2'):
            art_info = art.find(name = 'a')
            if art_info is None:
                continue
            art_title = self.tag_to_string(art_info)
            url = art_info.get('href')
            self.log.info('\tFound article:', art_title, 'at', url)
            article = {'title':art_title, 'url':url, 'date':''}
            #au = art.find(attrs={'class':'articleAuthors'})
            #if au is not None:
            #    article['author'] = self.tag_to_string(au)
            #desc = art.find(attrs={'class':'hover_text'})
            #if desc is not None:
            #    desc = self.tag_to_string(desc)
            #    if 'author' in article:
            #        desc = ' by ' + article['author'] + ' ' +desc
            #    article['description'] = desc
            articles.append(article)
        if articles:
            feeds.append((feed_title, articles))

        return feeds
@@ -8,19 +8,19 @@ Fetch sueddeutsche.de
 from calibre.web.feeds.news import BasicNewsRecipe
 class Sueddeutsche(BasicNewsRecipe):

-    title = u'Süddeutsche.de' # 2012-01-26 AGe Correct Title
-    description = 'News from Germany, Access to online content' # 2012-01-26 AGe
-    __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-01-26
-    publisher = u'Süddeutsche Zeitung' # 2012-01-26 AGe add
-    category = 'news, politics, Germany' # 2012-01-26 AGe add
-    timefmt = ' [%a, %d %b %Y]' # 2012-01-26 AGe add %a
+    title = u'Süddeutsche.de'
+    description = 'News from Germany, Access to online content'
+    __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-12-05
+    publisher = u'Süddeutsche Zeitung'
+    category = 'news, politics, Germany'
+    timefmt = ' [%a, %d %b %Y]'
     oldest_article = 7
     max_articles_per_feed = 100
     language = 'de'
     encoding = 'utf-8'
-    publication_type = 'newspaper' # 2012-01-26 add
+    publication_type = 'newspaper'
     cover_source = 'http://www.sueddeutsche.de/verlag' # 2012-01-26 AGe add from Darko Miletic paid content source
-    masthead_url = 'http://www.sueddeutsche.de/static_assets/build/img/sdesiteheader/logo_homepage.441d531c.png' # 2012-01-26 AGe add
+    masthead_url = 'http://www.sueddeutsche.de/static_assets/img/sdesiteheader/logo_standard.a152b0df.png' # 2012-12-05 AGe add

     use_embedded_content = False
     no_stylesheets = True
@@ -40,9 +40,9 @@ class Sueddeutsche(BasicNewsRecipe):
     (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'),
     (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'),
     (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'),
-    (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'), #2012-01-26 AGe New
-    (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'), #2012-01-26 AGe New
-    (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'), #2012-01-26 AGe New
+    (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'),
+    (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'),
+    (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'),
     (u'München & Region', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMünchen&Region%24?output=rss'),
     (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'),
     (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'),
@@ -1,9 +1,12 @@
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 __license__   = 'GPL v3'
-__copyright__ = '2012, Andreas Zeiser <andreas.zeiser@web.de>'
+__copyright__ = '2012, 2013 Andreas Zeiser <andreas.zeiser@web.de>'
 '''
 szmobil.sueddeutsche.de/
 '''
+# History
+# 2013.01.09 Fixed bugs in article titles containing "strong" and
+#            other small changes
+# 2012.08.04 Initial release

 from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
@@ -26,6 +29,8 @@ class SZmobil(BasicNewsRecipe):
     delay = 1
     cover_source = 'http://www.sueddeutsche.de/verlag'

+    # if you want to get rid of the date on the title page use
+    # timefmt = ''
     timefmt = ' [%a, %d %b, %Y]'

     root_url ='http://szmobil.sueddeutsche.de/'
@@ -76,10 +81,10 @@ class SZmobil(BasicNewsRecipe):
             # first check if link is a special article in section "Meinungsseite"
             if itt.find('strong')!= None:
                 article_name = itt.strong.string
-                article_shorttitle = itt.contents[1]
+                if len(itt.contents)>1:
+                    shorttitles[article_id] = itt.contents[1]

                 articles.append( (article_name, article_url, article_id) )
-                shorttitles[article_id] = article_shorttitle
                 continue

@@ -89,7 +94,7 @@ class SZmobil(BasicNewsRecipe):
             else:
                 article_name = itt.string

-            if (article_name[0:10] == " mehr"):
+            if (article_name.find(" mehr") == 0):
                 # just another link ("mehr") to an article
                 continue

@@ -102,7 +107,9 @@ class SZmobil(BasicNewsRecipe):
         for article_name, article_url, article_id in articles:
             url = self.root_url + article_url
             title = article_name
-            pubdate = strftime('%a, %d %b')
+            # if you want to get rid of date for each article use
+            # pubdate = strftime('')
+            pubdate = strftime('[%a, %d %b]')
             description = ''
             if shorttitles.has_key(article_id):
                 description = shorttitles[article_id]
@@ -115,3 +122,4 @@ class SZmobil(BasicNewsRecipe):

         return all_articles
+
@@ -16,8 +16,9 @@ class TidBITS(BasicNewsRecipe):
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
+    #auto_cleanup = True
     encoding = 'utf-8'
-    use_embedded_content = True
+    use_embedded_content = False
     language = 'en'
     remove_empty_feeds = True
     masthead_url = 'http://db.tidbits.com/images/tblogo9.gif'
@@ -30,9 +31,11 @@ class TidBITS(BasicNewsRecipe):
     , 'language' : language
     }

-    remove_attributes = ['width','height']
-    remove_tags = [dict(name='small')]
-    remove_tags_after = dict(name='small')
+    #remove_attributes = ['width','height']
+    #remove_tags = [dict(name='small')]
+    #remove_tags_after = dict(name='small')
+    keep_only_tags = [dict(name='div', attrs={'id':'center_ajax_sub'})]
+    remove_tags = [dict(name='div', attrs={'id':'social-media'})]

     feeds = [
     (u'Business Apps' , u'http://db.tidbits.com/feeds/business.rss' )
recipes/titanic_de.recipe (new file, 20 lines)
@@ -0,0 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe

class Titanic(BasicNewsRecipe):
    title = u'Titanic'
    language = 'de'
    __author__ = 'Krittika Goyal'
    oldest_article = 14 #days
    max_articles_per_feed = 25
    #encoding = 'cp1252'
    use_embedded_content = False

    no_stylesheets = True
    auto_cleanup = True


    feeds = [
        ('News',
         'http://www.titanic-magazin.de/ich.war.bei.der.waffen.rss'),
    ]
@@ -26,24 +26,28 @@ class TodaysZaman_en(BasicNewsRecipe):
     # remove_attributes = ['width','height']

     feeds = [
-        ( u'Home', u'http://www.todayszaman.com/rss?sectionId=0'),
-        ( u'News', u'http://www.todayszaman.com/rss?sectionId=100'),
-        ( u'Business', u'http://www.todayszaman.com/rss?sectionId=105'),
-        ( u'Interviews', u'http://www.todayszaman.com/rss?sectionId=8'),
-        ( u'Columnists', u'http://www.todayszaman.com/rss?sectionId=6'),
-        ( u'Op-Ed', u'http://www.todayszaman.com/rss?sectionId=109'),
-        ( u'Arts & Culture', u'http://www.todayszaman.com/rss?sectionId=110'),
-        ( u'Expat Zone', u'http://www.todayszaman.com/rss?sectionId=132'),
-        ( u'Sports', u'http://www.todayszaman.com/rss?sectionId=5'),
-        ( u'Features', u'http://www.todayszaman.com/rss?sectionId=116'),
-        ( u'Travel', u'http://www.todayszaman.com/rss?sectionId=117'),
-        ( u'Leisure', u'http://www.todayszaman.com/rss?sectionId=118'),
-        ( u'Weird But True', u'http://www.todayszaman.com/rss?sectionId=134'),
-        ( u'Life', u'http://www.todayszaman.com/rss?sectionId=133'),
-        ( u'Health', u'http://www.todayszaman.com/rss?sectionId=126'),
-        ( u'Press Review', u'http://www.todayszaman.com/rss?sectionId=130'),
-        ( u'Todays think tanks', u'http://www.todayszaman.com/rss?sectionId=159'),
+        ( u'Home', u'http://www.todayszaman.com/0.rss'),
+        ( u'Sports', u'http://www.todayszaman.com/5.rss'),
+        ( u'Columnists', u'http://www.todayszaman.com/6.rss'),
+        ( u'Interviews', u'http://www.todayszaman.com/9.rss'),
+        ( u'News', u'http://www.todayszaman.com/100.rss'),
+        ( u'National', u'http://www.todayszaman.com/101.rss'),
+        ( u'Diplomacy', u'http://www.todayszaman.com/102.rss'),
+        ( u'World', u'http://www.todayszaman.com/104.rss'),
+        ( u'Business', u'http://www.todayszaman.com/105.rss'),
+        ( u'Op-Ed', u'http://www.todayszaman.com/109.rss'),
+        ( u'Arts & Culture', u'http://www.todayszaman.com/110.rss'),
+        ( u'Features', u'http://www.todayszaman.com/116.rss'),
+        ( u'Travel', u'http://www.todayszaman.com/117.rss'),
+        ( u'Food', u'http://www.todayszaman.com/124.rss'),
+        ( u'Press Review', u'http://www.todayszaman.com/130.rss'),
+        ( u'Expat Zone', u'http://www.todayszaman.com/132.rss'),
+        ( u'Life', u'http://www.todayszaman.com/133.rss'),
+        ( u'Think Tanks', u'http://www.todayszaman.com/159.rss'),
+        ( u'Almanac', u'http://www.todayszaman.com/161.rss'),
+        ( u'Health', u'http://www.todayszaman.com/162.rss'),
+        ( u'Fashion & Beauty', u'http://www.todayszaman.com/163.rss'),
+        ( u'Science & Technology', u'http://www.todayszaman.com/349.rss'),
     ]

     #def preprocess_html(self, soup):
@@ -51,3 +55,4 @@ class TodaysZaman_en(BasicNewsRecipe):
     #def print_version(self, url): #there is a probem caused by table format
         #return url.replace('http://www.todayszaman.com/newsDetail_getNewsById.action?load=detay&', 'http://www.todayszaman.com/newsDetail_openPrintPage.action?')
+
recipes/tvp_info.recipe (new file, 20 lines)
@@ -0,0 +1,20 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from calibre.web.feeds.news import BasicNewsRecipe

class TVPINFO(BasicNewsRecipe):
    title = u'TVP.INFO'
    __author__ = 'fenuks'
    description = u'Serwis informacyjny TVP.INFO'
    category = 'news'
    language = 'pl'
    cover_url = 'http://s.v3.tvp.pl/files/tvp-info/gfx/logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = [dict(id='contentNews')]
    remove_tags = [dict(attrs={'class':['toolbox', 'modulBox read', 'modulBox social', 'videoPlayerBox']}), dict(id='belka')]
    feeds = [(u'Wiadomo\u015bci', u'http://tvp.info/informacje?xslt=tvp-info/news/rss.xslt&src_id=191865'),
        (u'\u015awiat', u'http://tvp.info/informacje/swiat?xslt=tvp-info/news/rss.xslt&src_id=191867'), (u'Biznes', u'http://tvp.info/informacje/biznes?xslt=tvp-info/news/rss.xslt&src_id=191868'), (u'Nauka', u'http://tvp.info/informacje/nauka?xslt=tvp-info/news/rss.xslt&src_id=191870'), (u'Kultura', u'http://tvp.info/informacje/kultura?xslt=tvp-info/news/rss.xslt&src_id=191869'), (u'Rozmaito\u015bci', u'http://tvp.info/informacje/rozmaitosci?xslt=tvp-info/news/rss.xslt&src_id=191872'), (u'Opinie', u'http://tvp.info/opinie?xslt=tvp-info/news/rss.xslt&src_id=191875'), (u'Komentarze', u'http://tvp.info/opinie/komentarze?xslt=tvp-info/news/rss.xslt&src_id=238200'), (u'Wywiady', u'http://tvp.info/opinie/wywiady?xslt=tvp-info/news/rss.xslt&src_id=236644')]
recipes/ukraiyns_kii_tizhdien.recipe (new file, 13 lines)
@@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1356283265(BasicNewsRecipe):
    title = u'\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0438\u0439 \u0422\u0438\u0436\u0434\u0435\u043d\u044c'
    __author__ = 'rpalyvoda'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'uk'
    cover_url = 'http://tyzhden.ua/Images/Style1/tyzhden.ua-logo2.gif'
    masthead_url = 'http://tyzhden.ua/Images/Style1/tyzhden.ua-logo2.gif'
    auto_cleanup = True

    feeds = [(u'\u041d\u043e\u0432\u0438\u043d\u0438', u'http://tyzhden.ua/RSS/News/'), (u'\u041e\u0440\u0438\u0433\u0456\u043d\u0430\u043b\u044c\u043d\u0456 \u043d\u043e\u0432\u0438\u043d\u0438', u'http://tyzhden.ua/RSS/News.Original/'), (u'\u041f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u0457', u'http://tyzhden.ua/RSS/Publications/')]
recipes/zaufana_trzecia_strona.recipe (new file, 16 lines)
@@ -0,0 +1,16 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from calibre.web.feeds.news import BasicNewsRecipe

class ZTS(BasicNewsRecipe):
    title = u'Zaufana Trzecia Strona'
    __author__ = 'fenuks'
    description = u'Niezależne źródło wiadomości o świecie bezpieczeństwa IT'
    category = 'IT, security'
    language = 'pl'
    cover_url = 'http://www.zaufanatrzeciastrona.pl/wp-content/uploads/2012/08/z3s_h100.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    keep_only_tags = [dict(name='div', attrs={'class':'post postcontent'})]
    remove_tags = [dict(name='div', attrs={'class':'dolna-ramka'})]
    feeds = [(u'Strona g\u0142\xf3wna', u'http://feeds.feedburner.com/ZaufanaTrzeciaStronaGlowna'), (u'Drobiazgi', u'http://feeds.feedburner.com/ZaufanaTrzeciaStronaDrobiazgi')]
recipes/zaxid_net.recipe (new file, 13 lines)
@@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1356281741(BasicNewsRecipe):
    title = u'Zaxid.net'
    __author__ = 'rpalyvoda'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'uk'
    cover_url = 'http://upload.wikimedia.org/wikipedia/uk/b/bc/Zaxid-net.jpg'
    masthead_url = 'http://upload.wikimedia.org/wikipedia/uk/b/bc/Zaxid-net.jpg'
    auto_cleanup = True

    feeds = [(u'\u0422\u043e\u043f \u043d\u043e\u0432\u0438\u043d\u0438', u'http://feeds.feedburner.com/zaxid/topNews'), (u'\u0421\u0442\u0440\u0456\u0447\u043a\u0430 \u043d\u043e\u0432\u0438\u043d', u'http://feeds.feedburner.com/zaxid/AllNews'), (u'\u041d\u043e\u0432\u0438\u043d\u0438 \u041b\u044c\u0432\u043e\u0432\u0430', u'http://feeds.feedburner.com/zaxid/Lviv'), (u'\u041d\u043e\u0432\u0438\u043d\u0438 \u0423\u043a\u0440\u0430\u0457\u043d\u0438', u'http://feeds.feedburner.com/zaxid/Ukraine'), (u'\u041d\u043e\u0432\u0438\u043d\u0438 \u0441\u0432\u0456\u0442\u0443', u'http://feeds.feedburner.com/zaxid/World'), (u'\u041d\u043e\u0432\u0438\u043d\u0438 - \u0420\u0430\u0434\u0456\u043e 24', u'\u0420\u0430\u0434\u0456\u043e 24'), (u'\u0411\u043b\u043e\u0433\u0438', u'http://feeds.feedburner.com/zaxid/Blogs'), (u"\u041f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u0457 - \u0406\u043d\u0442\u0435\u0440\u0432'\u044e", u'http://feeds.feedburner.com/zaxid/Interview'), (u'\u041f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u0457 - \u0421\u0442\u0430\u0442\u0442\u0456', u'http://feeds.feedburner.com/zaxid/Articles'), (u'\u0410\u0444\u0456\u0448\u0430', u'http://zaxid.net/rss/subcategory/140.xml'), (u'\u0413\u0430\u043b\u0438\u0447\u0438\u043d\u0430', u'http://feeds.feedburner.com/zaxid/Galicia'), (u'\u041a\u0443\u043b\u044c\u0442\u0443\u0440\u0430.NET', u'http://feeds.feedburner.com/zaxid/KulturaNET'), (u"\u043d\u0435\u0412\u0456\u0434\u043e\u043c\u0456 \u043b\u044c\u0432\u0456\u0432'\u044f\u043d\u0438", u'http://feeds.feedburner.com/zaxid/UnknownLviv'), (u'\u041b\u0435\u043e\u043f\u043e\u043b\u0456\u0441 MULTIPLEX', u'http://feeds.feedburner.com/zaxid/LeopolisMULTIPLEX'), (u'\u0411\u0438\u0442\u0432\u0430 \u0437\u0430 \u043c\u043e\u0432\u0443', u'http://zaxid.net/rss/subcategory/138.xml'), (u'\u0422\u0440\u0430\u043d\u0441\u043f\u043e\u0440\u0442\u043d\u0430 \u0441\u0445\u0435\u043c\u0430 \u041b\u044c\u0432\u043e\u0432\u0430', u'http://zaxid.net/rss/subcategory/132.xml'), (u'\u0414\u0435\u043c\u0456\u0444\u043e\u043b\u043e\u0433\u0456\u0437\u0430\u0446\u0456\u044f', u'http://zaxid.net/rss/subcategory/130.xml'), (u"\u041c\u0438 \u043f\u0430\u043c'\u044f\u0442\u0430\u0454\u043c\u043e", u'http://feeds.feedburner.com/zaxid/WeRemember'), (u'20 \u0440\u043e\u043a\u0456\u0432 \u041d\u0435\u0437\u0430\u043b\u0435\u0436\u043d\u043e\u0441\u0442\u0456', u'http://zaxid.net/rss/subcategory/129.xml'), (u'\u041f\u0440\u0430\u0432\u043e \u043d\u0430 \u0434\u0438\u0442\u0438\u043d\u0441\u0442\u0432\u043e', u'http://feeds.feedburner.com/zaxid/Childhood'), (u'\u0410\u043d\u043e\u043d\u0441\u0438', u'http://feeds.feedburner.com/zaxid/Announcements')]
@ -81,6 +81,7 @@ body {
 background-color: #39a9cf;
 -moz-border-radius: 5px;
 -webkit-border-radius: 5px;
+border-radius: 5px;
 text-shadow: #27211b 1px 1px 1px;
 -moz-box-shadow: 5px 5px 5px #222;
 -webkit-box-shadow: 5px 5px 5px #222;

(binary image changed: 17 KiB before, 62 KiB after)
@ -12,6 +12,7 @@ let g:syntastic_cpp_include_dirs = [
 \'/usr/include/fontconfig',
 \'src/qtcurve/common', 'src/qtcurve',
 \'src/unrar',
+\'src/qt-harfbuzz/src',
 \'/usr/include/ImageMagick',
 \]
 let g:syntastic_c_include_dirs = g:syntastic_cpp_include_dirs
@ -6,12 +6,13 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, socket, struct, subprocess, sys, glob
+import os, socket, struct, subprocess, glob
 from distutils.spawn import find_executable
 
 from PyQt4 import pyqtconfig
 
 from setup import isosx, iswindows, islinux, is64bit
+is64bit
 
 OSX_SDK = '/Developer/SDKs/MacOSX10.5.sdk'
 
@ -81,6 +82,7 @@ def consolidate(envvar, default):
 pyqt = pyqtconfig.Configuration()
 
 qt_inc = pyqt.qt_inc_dir
+qt_private_inc = []
 qt_lib = pyqt.qt_lib_dir
 ft_lib_dirs = []
 ft_libs = []
@ -140,6 +142,8 @@ elif isosx:
 png_libs = ['png12']
 ft_libs = ['freetype']
 ft_inc_dirs = ['/sw/include/freetype2']
+bq = glob.glob('/sw/build/qt-*/include')[-1]
+qt_private_inc = ['%s/%s'%(bq, m) for m in ('QtGui', 'QtCore')]
 else:
 # Include directories
 png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
@ -102,7 +102,8 @@ class Check(Command):
 errors = True
 if errors:
 cPickle.dump(cache, open(self.CACHE, 'wb'), -1)
-subprocess.call(['gvim', '-f', f])
+subprocess.call(['gvim', '-S',
+self.j(self.SRC, '../session.vim'), '-f', f])
 raise SystemExit(1)
 cache[f] = mtime
 for x in builtins:
@ -18,7 +18,7 @@ from setup.build_environment import (chmlib_inc_dirs,
 msvc, MT, win_inc, win_lib, win_ddk, magick_inc_dirs, magick_lib_dirs,
 magick_libs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs,
 icu_lib_dirs, win_ddk_lib_dirs, ft_libs, ft_lib_dirs, ft_inc_dirs,
-zlib_libs, zlib_lib_dirs, zlib_inc_dirs, is64bit)
+zlib_libs, zlib_lib_dirs, zlib_inc_dirs, is64bit, qt_private_inc)
 MT
 isunix = islinux or isosx or isbsd
 
@ -183,6 +183,13 @@ extensions = [
 sip_files = ['calibre/gui2/progress_indicator/QProgressIndicator.sip']
 ),
 
+Extension('qt_hack',
+['calibre/ebooks/pdf/render/qt_hack.cpp'],
+inc_dirs = qt_private_inc + ['calibre/ebooks/pdf/render', 'qt-harfbuzz/src'],
+headers = ['calibre/ebooks/pdf/render/qt_hack.h'],
+sip_files = ['calibre/ebooks/pdf/render/qt_hack.sip']
+),
+
 Extension('unrar',
 ['unrar/%s.cpp'%(x.partition('.')[0]) for x in '''
 rar.o strlist.o strfn.o pathfn.o savepos.o smallfn.o global.o file.o
@ -545,6 +552,9 @@ class Build(Command):
 VERSION = 1.0.0
 CONFIG += %s
 ''')%(ext.name, ' '.join(ext.headers), ' '.join(ext.sources), archs)
+if ext.inc_dirs:
+idir = ' '.join(ext.inc_dirs)
+pro += 'INCLUDEPATH = %s\n'%idir
 pro = pro.replace('\\', '\\\\')
 open(ext.name+'.pro', 'wb').write(pro)
 qmc = [QMAKE, '-o', 'Makefile']
@ -102,7 +102,8 @@ class Win32Freeze(Command, WixMixIn):
 repl_pat = re.compile(
 r'(?is)<dependency>.*?Microsoft\.VC\d+\.CRT.*?</dependency>')
 
-for dll in glob.glob(self.j(self.dll_dir, '*.dll')):
+for dll in (glob.glob(self.j(self.dll_dir, '*.dll')) +
+glob.glob(self.j(self.plugins_dir, '*.pyd'))):
 bn = self.b(dll)
 with open(dll, 'rb') as f:
 raw = f.read()
@ -598,6 +599,10 @@ class Win32Freeze(Command, WixMixIn):
 # from files
 'unrar.pyd', 'wpd.pyd', 'podofo.pyd',
 'progress_indicator.pyd',
+# As per this https://bugs.launchpad.net/bugs/1087816
+# on some systems magick.pyd fails to load from memory
+# on 64 bit
+'magick.pyd',
 }:
 self.add_to_zipfile(zf, pyd, x)
 os.remove(self.j(x, pyd))
setup/iso_639/ca.po (1438 changed lines)
@ -9,14 +9,14 @@ msgstr ""
 "Project-Id-Version: calibre\n"
 "Report-Msgid-Bugs-To: FULL NAME <EMAIL@ADDRESS>\n"
 "POT-Creation-Date: 2011-11-25 14:01+0000\n"
-"PO-Revision-Date: 2012-08-15 10:30+0000\n"
+"PO-Revision-Date: 2012-12-28 09:13+0000\n"
 "Last-Translator: Jellby <Unknown>\n"
 "Language-Team: Español; Castellano <>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"X-Launchpad-Export-Date: 2012-08-16 04:40+0000\n"
-"X-Generator: Launchpad (build 15810)\n"
+"X-Launchpad-Export-Date: 2012-12-29 05:00+0000\n"
+"X-Generator: Launchpad (build 16378)\n"
 
 #. name for aaa
 msgid "Ghotuo"
@ -9584,27 +9584,27 @@ msgstr "Holikachuk"
|
|||||||
|
|
||||||
#. name for hoj
|
#. name for hoj
|
||||||
msgid "Hadothi"
|
msgid "Hadothi"
|
||||||
msgstr ""
|
msgstr "Hadoti"
|
||||||
|
|
||||||
#. name for hol
|
#. name for hol
|
||||||
msgid "Holu"
|
msgid "Holu"
|
||||||
msgstr ""
|
msgstr "Holu"
|
||||||
|
|
||||||
#. name for hom
|
#. name for hom
|
||||||
msgid "Homa"
|
msgid "Homa"
|
||||||
msgstr ""
|
msgstr "Homa"
|
||||||
|
|
||||||
#. name for hoo
|
#. name for hoo
|
||||||
msgid "Holoholo"
|
msgid "Holoholo"
|
||||||
msgstr ""
|
msgstr "Holoholo"
|
||||||
|
|
||||||
#. name for hop
|
#. name for hop
|
||||||
msgid "Hopi"
|
msgid "Hopi"
|
||||||
msgstr ""
|
msgstr "Hopi"
|
||||||
|
|
||||||
#. name for hor
|
#. name for hor
|
||||||
msgid "Horo"
|
msgid "Horo"
|
||||||
msgstr ""
|
msgstr "Horo"
|
||||||
|
|
||||||
#. name for hos
|
#. name for hos
|
||||||
msgid "Ho Chi Minh City Sign Language"
|
msgid "Ho Chi Minh City Sign Language"
|
||||||
@ -9612,27 +9612,27 @@ msgstr "Lengua de signos de Ho Chi Minh"
|
|||||||
|
|
||||||
#. name for hot
|
#. name for hot
|
||||||
msgid "Hote"
|
msgid "Hote"
|
||||||
msgstr ""
|
msgstr "Hote"
|
||||||
|
|
||||||
#. name for hov
|
#. name for hov
|
||||||
msgid "Hovongan"
|
msgid "Hovongan"
|
||||||
msgstr ""
|
msgstr "Hovongan"
|
||||||
|
|
||||||
#. name for how
|
#. name for how
|
||||||
msgid "Honi"
|
msgid "Honi"
|
||||||
msgstr ""
|
msgstr "Honi"
|
||||||
|
|
||||||
#. name for hoy
|
#. name for hoy
|
||||||
msgid "Holiya"
|
msgid "Holiya"
|
||||||
msgstr ""
|
msgstr "Holiya"
|
||||||
|
|
||||||
#. name for hoz
|
#. name for hoz
|
||||||
msgid "Hozo"
|
msgid "Hozo"
|
||||||
msgstr ""
|
msgstr "Hozo"
|
||||||
|
|
||||||
#. name for hpo
|
#. name for hpo
|
||||||
msgid "Hpon"
|
msgid "Hpon"
|
||||||
msgstr ""
|
msgstr "Hpon"
|
||||||
|
|
||||||
#. name for hps
|
#. name for hps
|
||||||
msgid "Hawai'i Pidgin Sign Language"
|
msgid "Hawai'i Pidgin Sign Language"
|
||||||
@ -9640,15 +9640,15 @@ msgstr "Lengua de signos pidyin hawaiana"
|
|||||||
|
|
||||||
#. name for hra
|
#. name for hra
|
||||||
msgid "Hrangkhol"
|
msgid "Hrangkhol"
|
||||||
msgstr ""
|
msgstr "Hrangkhol"
|
||||||
|
|
||||||
#. name for hre
|
#. name for hre
|
||||||
msgid "Hre"
|
msgid "Hre"
|
||||||
msgstr ""
|
msgstr "Hre"
|
||||||
|
|
||||||
#. name for hrk
|
#. name for hrk
|
||||||
msgid "Haruku"
|
msgid "Haruku"
|
||||||
msgstr ""
|
msgstr "Haruku"
|
||||||
|
|
||||||
#. name for hrm
|
#. name for hrm
|
||||||
msgid "Miao; Horned"
|
msgid "Miao; Horned"
|
||||||
@ -9656,19 +9656,19 @@ msgstr ""
|
|||||||
|
|
||||||
#. name for hro
|
#. name for hro
|
||||||
msgid "Haroi"
|
msgid "Haroi"
|
||||||
msgstr ""
|
msgstr "Haroi"
|
||||||
|
|
||||||
#. name for hrr
|
#. name for hrr
|
||||||
msgid "Horuru"
|
msgid "Horuru"
|
||||||
msgstr ""
|
msgstr "Horuru"
|
||||||
|
|
||||||
#. name for hrt
|
#. name for hrt
|
||||||
msgid "Hértevin"
|
msgid "Hértevin"
|
||||||
msgstr ""
|
msgstr "Hértevin"
|
||||||
|
|
||||||
#. name for hru
|
#. name for hru
|
||||||
msgid "Hruso"
|
msgid "Hruso"
|
||||||
msgstr ""
|
msgstr "Hruso"
|
||||||
|
|
||||||
#. name for hrv
|
#. name for hrv
|
||||||
msgid "Croatian"
|
msgid "Croatian"
|
||||||
@ -11796,7 +11796,7 @@ msgstr ""
|
|||||||
|
|
||||||
#. name for khq
|
#. name for khq
|
||||||
msgid "Songhay; Koyra Chiini"
|
msgid "Songhay; Koyra Chiini"
|
||||||
msgstr ""
|
msgstr "Songhay koyra chiini"
|
||||||
|
|
||||||
#. name for khr
|
#. name for khr
|
||||||
msgid "Kharia"
|
msgid "Kharia"
|
||||||
|
@ -12,14 +12,14 @@ msgstr ""
|
|||||||
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
"Report-Msgid-Bugs-To: Debian iso-codes team <pkg-isocodes-"
|
||||||
"devel@lists.alioth.debian.org>\n"
|
"devel@lists.alioth.debian.org>\n"
|
||||||
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
"POT-Creation-Date: 2011-11-25 14:01+0000\n"
|
||||||
"PO-Revision-Date: 2011-09-27 15:44+0000\n"
|
"PO-Revision-Date: 2012-12-13 13:56+0000\n"
|
||||||
"Last-Translator: IIDA Yosiaki <iida@gnu.org>\n"
|
"Last-Translator: Shushi Kurose <md81bird@hitaki.net>\n"
|
||||||
"Language-Team: Japanese <translation-team-ja@lists.sourceforge.net>\n"
|
"Language-Team: Japanese <translation-team-ja@lists.sourceforge.net>\n"
|
||||||
"MIME-Version: 1.0\n"
|
"MIME-Version: 1.0\n"
|
||||||
"Content-Type: text/plain; charset=UTF-8\n"
|
"Content-Type: text/plain; charset=UTF-8\n"
|
||||||
"Content-Transfer-Encoding: 8bit\n"
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
"X-Launchpad-Export-Date: 2011-11-26 05:21+0000\n"
|
"X-Launchpad-Export-Date: 2012-12-14 05:34+0000\n"
|
||||||
"X-Generator: Launchpad (build 14381)\n"
|
"X-Generator: Launchpad (build 16369)\n"
|
||||||
"Language: ja\n"
|
"Language: ja\n"
|
||||||
|
|
||||||
#. name for aaa
|
#. name for aaa
|
||||||
@ -86,12 +86,9 @@ msgstr ""
|
|||||||
msgid "Abnaki; Eastern"
|
msgid "Abnaki; Eastern"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
# 以下「国国」は、国立国会図書館のサイト。
|
|
||||||
# ジブチ
|
|
||||||
# マイペディア「ジブチ」の項に「アファル語」
|
|
||||||
#. name for aar
|
#. name for aar
|
||||||
msgid "Afar"
|
msgid "Afar"
|
||||||
msgstr "アファール語"
|
msgstr "アファル語"
|
||||||
|
|
||||||
#. name for aas
|
#. name for aas
|
||||||
msgid "Aasáx"
|
msgid "Aasáx"
|
||||||
|
setup/iso_639/ms.po (14524 changed lines)
@ -227,9 +227,22 @@ class GetTranslations(Translations): # {{{
 ans.append(line.split()[-1])
 return ans
 
+def resolve_conflicts(self):
+conflict = False
+for line in subprocess.check_output(['bzr', 'status']).splitlines():
+if line == 'conflicts:':
+conflict = True
+break
+if not conflict:
+raise Exception('bzr merge failed and no conflicts found')
+subprocess.check_call(['bzr', 'resolve', '--take-other'])
+
 def run(self, opts):
 if not self.modified_translations:
+try:
 subprocess.check_call(['bzr', 'merge', self.BRANCH])
+except subprocess.CalledProcessError:
+self.resolve_conflicts()
 self.check_for_errors()
 
 if self.modified_translations:
@ -4,7 +4,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 __appname__ = u'calibre'
-numeric_version = (0, 9, 8)
+numeric_version = (0, 9, 14)
 __version__ = u'.'.join(map(unicode, numeric_version))
 __author__ = u"Kovid Goyal <kovid@kovidgoyal.net>"
 
@ -100,6 +100,7 @@ class Plugins(collections.Mapping):
 'freetype',
 'woff',
 'unrar',
+'qt_hack',
 ]
 if iswindows:
 plugins.extend(['winutil', 'wpd', 'winfonts'])
@ -661,7 +661,7 @@ from calibre.devices.nuut2.driver import NUUT2
 from calibre.devices.iriver.driver import IRIVER_STORY
 from calibre.devices.binatone.driver import README
 from calibre.devices.hanvon.driver import (N516, EB511, ALEX, AZBOOKA, THEBOOK,
-LIBREAIR, ODYSSEY)
+LIBREAIR, ODYSSEY, KIBANO)
 from calibre.devices.edge.driver import EDGE
 from calibre.devices.teclast.driver import (TECLAST_K3, NEWSMY, IPAPYRUS,
 SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O, STASH, WEXLER)
@ -712,7 +712,7 @@ plugins += [
 BOOQ,
 EB600,
 README,
-N516,
+N516, KIBANO,
 THEBOOK, LIBREAIR,
 EB511,
 ELONEX,
|
|||||||
formats = ['EPUB', 'MOBI', 'PDF']
|
formats = ['EPUB', 'MOBI', 'PDF']
|
||||||
affiliate = True
|
affiliate = True
|
||||||
|
|
||||||
|
class StoreNookUKStore(StoreBase):
|
||||||
|
name = 'Nook UK'
|
||||||
|
author = 'John Schember'
|
||||||
|
description = u'Barnes & Noble S.à r.l, a subsidiary of Barnes & Noble, Inc., a leading retailer of content, digital media and educational products, is proud to bring the award-winning NOOK® reading experience and a leading digital bookstore to the UK.'
|
||||||
|
actual_plugin = 'calibre.gui2.store.stores.nook_uk_plugin:NookUKStore'
|
||||||
|
|
||||||
|
headquarters = 'UK'
|
||||||
|
formats = ['NOOK']
|
||||||
|
|
||||||
class StoreOpenBooksStore(StoreBase):
|
class StoreOpenBooksStore(StoreBase):
|
||||||
name = 'Open Books'
|
name = 'Open Books'
|
||||||
description = u'Comprehensive listing of DRM free ebooks from a variety of sources provided by users of calibre.'
|
description = u'Comprehensive listing of DRM free ebooks from a variety of sources provided by users of calibre.'
|
||||||
@ -1660,7 +1669,7 @@ plugins += [
|
|||||||
StoreAmazonITKindleStore,
|
StoreAmazonITKindleStore,
|
||||||
StoreAmazonUKKindleStore,
|
StoreAmazonUKKindleStore,
|
||||||
StoreBaenWebScriptionStore,
|
StoreBaenWebScriptionStore,
|
||||||
StoreBNStore, StoreSonyStore,
|
StoreBNStore,
|
||||||
StoreBeWriteStore,
|
StoreBeWriteStore,
|
||||||
StoreBiblioStore,
|
StoreBiblioStore,
|
||||||
StoreBookotekaStore,
|
StoreBookotekaStore,
|
||||||
@ -1686,12 +1695,14 @@ plugins += [
 StoreMillsBoonUKStore,
 StoreMobileReadStore,
 StoreNextoStore,
+StoreNookUKStore,
 StoreOpenBooksStore,
 StoreOzonRUStore,
 StorePragmaticBookshelfStore,
 StorePublioStore,
 StoreRW2010Store,
 StoreSmashwordsStore,
+StoreSonyStore,
 StoreVirtualoStore,
 StoreWaterstonesUKStore,
 StoreWeightlessBooksStore,
@ -121,6 +121,8 @@ def debug(ioreg_to_tmp=False, buf=None, plugins=None,
 out('\nDisabled plugins:', textwrap.fill(' '.join([x.__class__.__name__ for x in
 disabled_plugins])))
 out(' ')
+else:
+out('\nNo disabled plugins')
 found_dev = False
 for dev in devplugins:
 if not dev.MANAGES_DEVICE_PRESENCE: continue
@ -10,7 +10,7 @@ import cStringIO
 
 from calibre.devices.usbms.driver import USBMS
 
-HTC_BCDS = [0x100, 0x0222, 0x0226, 0x227, 0x228, 0x229, 0x9999]
+HTC_BCDS = [0x100, 0x0222, 0x0226, 0x227, 0x228, 0x229, 0x0231, 0x9999]
 
 class ANDROID(USBMS):
 
@ -48,6 +48,7 @@ class ANDROID(USBMS):
 0x2910 : HTC_BCDS,
 0xe77 : HTC_BCDS,
 0xff9 : HTC_BCDS,
+0x0001 : [0x255],
 },
 
 # Eken
@ -92,7 +93,7 @@ class ANDROID(USBMS):
 # Google
 0x18d1 : {
 0x0001 : [0x0223, 0x230, 0x9999],
-0x0003 : [0x0230],
+0x0003 : [0x0230, 0x9999],
 0x4e11 : [0x0100, 0x226, 0x227],
 0x4e12 : [0x0100, 0x226, 0x227],
 0x4e21 : [0x0100, 0x226, 0x227, 0x231],
@ -190,7 +191,7 @@ class ANDROID(USBMS):
 0x10a9 : { 0x6050 : [0x227] },
 
 # Prestigio
-0x2207 : { 0 : [0x222] },
+0x2207 : { 0 : [0x222], 0x10 : [0x222] },
 
 }
 EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books',
@ -212,7 +213,8 @@ class ANDROID(USBMS):
 'VIZIO', 'GOOGLE', 'FREESCAL', 'KOBO_INC', 'LENOVO', 'ROCKCHIP',
 'POCKET', 'ONDA_MID', 'ZENITHIN', 'INGENIC', 'PMID701C', 'PD',
 'PMP5097C', 'MASS', 'NOVO7', 'ZEKI', 'COBY', 'SXZ', 'USB_2.0',
-'COBY_MID', 'VS', 'AINOL', 'TOPWISE', 'PAD703']
+'COBY_MID', 'VS', 'AINOL', 'TOPWISE', 'PAD703', 'NEXT8D12',
+'MEDIATEK', 'KEENHI']
 WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
 '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
 'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID',
@ -232,7 +234,8 @@ class ANDROID(USBMS):
 'THINKPAD_TABLET', 'SGH-T989', 'YP-G70', 'STORAGE_DEVICE',
 'ADVANCED', 'SGH-I727', 'USB_FLASH_DRIVER', 'ANDROID',
 'S5830I_CARD', 'MID7042', 'LINK-CREATE', '7035', 'VIEWPAD_7E',
-'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC']
+'NOVO7', 'MB526', '_USB#WYK7MSF8KE', 'TABLET_PC', 'F', 'MT65XX_MS',
+'ICS']
 WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
 'FILE-STOR_GADGET', 'SGH-T959_CARD', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
 'A70S', 'A101IT', '7', 'INCREDIBLE', 'A7EB', 'SGH-T849_CARD',
@ -243,7 +246,7 @@ class ANDROID(USBMS):
 'FILE-CD_GADGET', 'GT-I9001_CARD', 'USB_2.0', 'XT875',
 'UMS_COMPOSITE', 'PRO', '.KOBO_VOX', 'SGH-T989_CARD', 'SGH-I727',
 'USB_FLASH_DRIVER', 'ANDROID', 'MID7042', '7035', 'VIEWPAD_7E',
-'NOVO7', 'ADVANCED', 'TABLET_PC']
+'NOVO7', 'ADVANCED', 'TABLET_PC', 'F']
 
 OSX_MAIN_MEM = 'Android Device Main Memory'
 
@ -214,9 +214,9 @@ class ITUNES(DriverBase):
 "Cannot copy books directly from iDevice. "
 "Drag from iTunes Library to desktop, then add to calibre's Library window.")
 UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE = _(
-"Unsupported direct connect mode. "
+"*** Unsupported direct connect mode. "
 "See http://www.mobileread.com/forums/showthread.php?t=118559 "
-"for instructions on using 'Connect to iTunes'")
+"for instructions on using 'Connect to iTunes' ***")
 ITUNES_SANDBOX_LOCKOUT_MESSAGE = _(
 '<p>Unable to communicate with iTunes.</p>'
 '<p>Refer to this '
@ -818,7 +818,7 @@ class ITUNES(DriverBase):
 if DEBUG:
 logger().info("%s.get_device_information()" % self.__class__.__name__)
 
-return (self.sources['iPod'], 'hw v1.0', 'sw v1.0', 'mime type normally goes here')
+return (self.sources['iPod'], 'hw v1.0', 'sw v1.0', 'unknown mime type')
 
 def get_file(self, path, outfile, end_session=True):
 '''
@ -871,13 +871,14 @@ class ITUNES(DriverBase):
 product_id
 ))
 
+if False:
 # Display a dialog recommending using 'Connect to iTunes' if user hasn't
 # previously disabled the dialog
 if dynamic.get(confirm_config_name(self.DISPLAY_DISABLE_DIALOG), True):
 raise AppleOpenFeedback(self)
 else:
 if DEBUG:
-logger().error(" %s" % self.UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE)
+logger().info(" %s" % self.UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE)
 
 # Log supported DEVICE_IDs and BCDs
 logger().info(" BCD: %s" % ['0x%x' % x for x in sorted(self.BCD)])
|
|||||||
self.plugboards = plugboards
|
self.plugboards = plugboards
|
||||||
self.plugboard_func = pb_func
|
self.plugboard_func = pb_func
|
||||||
|
|
||||||
|
def shutdown(self):
|
||||||
|
if DEBUG:
|
||||||
|
logger().info("%s.shutdown()\n" % self.__class__.__name__)
|
||||||
|
|
||||||
def sync_booklists(self, booklists, end_session=True):
|
def sync_booklists(self, booklists, end_session=True):
|
||||||
'''
|
'''
|
||||||
Update metadata on device.
|
Update metadata on device.
|
||||||
@ -1125,6 +1130,7 @@ class ITUNES(DriverBase):
|
|||||||
metadata[i].uuid))
|
metadata[i].uuid))
|
||||||
self.cached_books[this_book.path] = {
|
self.cached_books[this_book.path] = {
|
||||||
'author': authors_to_string(metadata[i].authors),
|
'author': authors_to_string(metadata[i].authors),
|
||||||
|
'authors': metadata[i].authors,
|
||||||
'dev_book': db_added,
|
'dev_book': db_added,
|
||||||
'format': format,
|
'format': format,
|
||||||
'lib_book': lb_added,
|
'lib_book': lb_added,
|
||||||
@ -1171,6 +1177,7 @@ class ITUNES(DriverBase):
|
|||||||
metadata[i].uuid))
|
metadata[i].uuid))
|
||||||
self.cached_books[this_book.path] = {
|
self.cached_books[this_book.path] = {
|
||||||
'author': authors_to_string(metadata[i].authors),
|
'author': authors_to_string(metadata[i].authors),
|
||||||
|
'authors': metadata[i].authors,
|
||||||
'dev_book': db_added,
|
'dev_book': db_added,
|
||||||
'format': format,
|
'format': format,
|
||||||
'lib_book': lb_added,
|
'lib_book': lb_added,
|
||||||
@ -1388,21 +1395,18 @@ class ITUNES(DriverBase):
|
|||||||
db_added = None
|
db_added = None
|
||||||
lb_added = None
|
lb_added = None
|
||||||
|
|
||||||
|
if self.manual_sync_mode:
|
||||||
|
'''
|
||||||
|
DC mode. Add to iBooks only.
|
||||||
|
'''
|
||||||
|
db_added = self._add_device_book(fpath, metadata)
|
||||||
|
else:
|
||||||
# If using iTunes_local_storage, copy the file, redirect iTunes to use local copy
|
# If using iTunes_local_storage, copy the file, redirect iTunes to use local copy
|
||||||
if not self.settings().extra_customization[self.USE_ITUNES_STORAGE]:
|
if not self.settings().extra_customization[self.USE_ITUNES_STORAGE]:
|
||||||
local_copy = os.path.join(self.iTunes_local_storage, str(metadata.uuid) + os.path.splitext(fpath)[1])
|
local_copy = os.path.join(self.iTunes_local_storage, str(metadata.uuid) + os.path.splitext(fpath)[1])
|
||||||
shutil.copyfile(fpath, local_copy)
|
shutil.copyfile(fpath, local_copy)
|
||||||
fpath = local_copy
|
fpath = local_copy
|
||||||
|
|
||||||
if self.manual_sync_mode:
|
|
||||||
'''
|
|
||||||
Unsupported direct-connect mode.
|
|
||||||
'''
|
|
||||||
db_added = self._add_device_book(fpath, metadata)
|
|
||||||
lb_added = self._add_library_book(fpath, metadata)
|
|
||||||
if not lb_added and DEBUG:
|
|
||||||
logger().warn(" failed to add '%s' to iTunes, iTunes Media folder inaccessible" % metadata.title)
|
|
||||||
else:
|
|
||||||
lb_added = self._add_library_book(fpath, metadata)
|
lb_added = self._add_library_book(fpath, metadata)
|
||||||
if not lb_added:
|
if not lb_added:
|
||||||
raise UserFeedback("iTunes Media folder inaccessible",
|
raise UserFeedback("iTunes Media folder inaccessible",
|
||||||
@ -2336,6 +2340,7 @@ class ITUNES(DriverBase):
|
|||||||
except:
|
except:
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
logger().info(" no books in library")
|
logger().info(" no books in library")
|
||||||
|
|
||||||
self.library_orphans = library_orphans
|
self.library_orphans = library_orphans
|
||||||
return library_books
|
return library_books
|
||||||
|
|
||||||
@ -2435,13 +2440,13 @@ class ITUNES(DriverBase):
|
|||||||
as_binding = "dynamic"
|
as_binding = "dynamic"
|
||||||
try:
|
try:
|
||||||
# Try dynamic binding - works with iTunes <= 10.6.1
|
# Try dynamic binding - works with iTunes <= 10.6.1
|
||||||
foo = self.iTunes.name()
|
self.iTunes.name()
|
||||||
except:
|
except:
|
||||||
# Try static binding
|
# Try static binding
|
||||||
import itunes
|
import itunes
|
||||||
self.iTunes = appscript.app('iTunes', terms=itunes)
|
self.iTunes = appscript.app('iTunes', terms=itunes)
|
||||||
try:
|
try:
|
||||||
foo = self.iTunes.name()
|
self.iTunes.name()
|
||||||
as_binding = "static"
|
as_binding = "static"
|
||||||
except:
|
except:
|
||||||
self.iTunes = None
|
self.iTunes = None
|
||||||
@ -2494,8 +2499,8 @@ class ITUNES(DriverBase):
|
|||||||
self.iTunes = win32com.client.Dispatch("iTunes.Application")
|
self.iTunes = win32com.client.Dispatch("iTunes.Application")
|
||||||
except:
|
except:
|
||||||
self.iTunes = None
|
self.iTunes = None
|
||||||
raise UserFeedback(' %s._launch_iTunes(): unable to find installed iTunes'
|
raise OpenFeedback('Unable to launch iTunes.\n' +
|
||||||
% self.__class__.__name__, details=None, level=UserFeedback.WARN)
|
'Try launching calibre as Administrator')
|
||||||
|
|
||||||
if not DEBUG:
|
if not DEBUG:
|
||||||
self.iTunes.Windows[0].Minimized = True
|
self.iTunes.Windows[0].Minimized = True
|
||||||
@ -2503,8 +2508,7 @@ class ITUNES(DriverBase):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Pre-emptive test to confirm functional iTunes automation interface
|
# Pre-emptive test to confirm functional iTunes automation interface
|
||||||
foo = self.iTunes.Version
|
logger().info(" automation interface with iTunes %s established" % self.iTunes.Version)
|
||||||
foo
|
|
||||||
except:
|
except:
|
||||||
self.iTunes = None
|
self.iTunes = None
|
||||||
raise OpenFeedback('Unable to connect to iTunes.\n' +
|
raise OpenFeedback('Unable to connect to iTunes.\n' +
|
||||||
@ -2547,7 +2551,6 @@ class ITUNES(DriverBase):
|
|||||||
'''
|
'''
|
||||||
PURGE_ORPHANS = False
|
PURGE_ORPHANS = False
|
||||||
|
|
||||||
if PURGE_ORPHANS:
|
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
logger().info(" %s._purge_orphans()" % self.__class__.__name__)
|
logger().info(" %s._purge_orphans()" % self.__class__.__name__)
|
||||||
#self._dump_library_books(library_books)
|
#self._dump_library_books(library_books)
|
||||||
@ -2557,6 +2560,7 @@ class ITUNES(DriverBase):
|
|||||||
if isosx:
|
if isosx:
|
||||||
if book not in cached_books and \
|
if book not in cached_books and \
|
||||||
str(library_books[book].description()).startswith(self.description_prefix):
|
str(library_books[book].description()).startswith(self.description_prefix):
|
||||||
|
if PURGE_ORPHANS:
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
logger().info(" '%s' not found on iDevice, removing from iTunes" % book)
|
logger().info(" '%s' not found on iDevice, removing from iTunes" % book)
|
||||||
btr = {
|
btr = {
|
||||||
@ -2564,9 +2568,14 @@ class ITUNES(DriverBase):
|
|||||||
'author': library_books[book].artist(),
|
'author': library_books[book].artist(),
|
||||||
'lib_book': library_books[book]}
|
'lib_book': library_books[book]}
|
||||||
self._remove_from_iTunes(btr)
|
self._remove_from_iTunes(btr)
|
||||||
|
else:
|
||||||
|
if DEBUG:
|
||||||
|
logger().info(" '%s' found in iTunes, but not on iDevice" % (book))
|
||||||
|
|
||||||
elif iswindows:
|
elif iswindows:
|
||||||
if book not in cached_books and \
|
if book not in cached_books and \
|
||||||
library_books[book].Description.startswith(self.description_prefix):
|
library_books[book].Description.startswith(self.description_prefix):
|
||||||
|
if PURGE_ORPHANS:
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
logger().info(" '%s' not found on iDevice, removing from iTunes" % book)
|
logger().info(" '%s' not found on iDevice, removing from iTunes" % book)
|
||||||
btr = {
|
btr = {
|
||||||
@ -2576,7 +2585,7 @@ class ITUNES(DriverBase):
|
|||||||
self._remove_from_iTunes(btr)
|
self._remove_from_iTunes(btr)
|
||||||
else:
|
else:
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
logger().info(" %s._purge_orphans(disabled)" % self.__class__.__name__)
|
logger().info(" '%s' found in iTunes, but not on iDevice" % (book))
|
||||||
|
|
||||||
def _remove_existing_copy(self, path, metadata):
|
def _remove_existing_copy(self, path, metadata):
|
||||||
'''
|
'''
|
||||||
@ -3107,11 +3116,10 @@ class ITUNES(DriverBase):
|
|||||||
|
|
||||||
def _wait_for_writable_metadata(self, db_added, delay=2.0):
|
def _wait_for_writable_metadata(self, db_added, delay=2.0):
|
||||||
'''
|
'''
|
||||||
Ensure iDevice metadata is writable. Direct connect mode only
|
Ensure iDevice metadata is writable. DC mode only
|
||||||
'''
|
'''
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
logger().info(" %s._wait_for_writable_metadata()" % self.__class__.__name__)
|
logger().info(" %s._wait_for_writable_metadata()" % self.__class__.__name__)
|
||||||
logger().warning(" %s" % self.UNSUPPORTED_DIRECT_CONNECT_MODE_MESSAGE)
|
|
||||||
|
|
||||||
attempts = 9
|
attempts = 9
|
||||||
while attempts:
|
while attempts:
|
||||||
|
@ -41,6 +41,20 @@ class N516(USBMS):
|
|||||||
def can_handle(self, device_info, debug=False):
|
def can_handle(self, device_info, debug=False):
|
||||||
return not is_alex(device_info)
|
return not is_alex(device_info)
|
||||||
|
|
||||||
|
class KIBANO(N516):
|
||||||
|
|
||||||
|
name = 'Kibano driver'
|
||||||
|
gui_name = 'Kibano'
|
||||||
|
description = _('Communicate with the Kibano eBook reader.')
|
||||||
|
FORMATS = ['epub', 'pdf', 'txt']
|
||||||
|
BCD = [0x323]
|
||||||
|
|
||||||
|
VENDOR_NAME = 'EBOOK'
|
||||||
|
# We use EXTERNAL_SD_CARD for main mem as some devices have not working
|
||||||
|
# main memories
|
||||||
|
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['INTERNAL_SD_CARD',
|
||||||
|
'EXTERNAL_SD_CARD']
|
||||||
|
|
||||||
class THEBOOK(N516):
|
class THEBOOK(N516):
|
||||||
name = 'The Book driver'
|
name = 'The Book driver'
|
||||||
gui_name = 'The Book'
|
gui_name = 'The Book'
|
||||||
|
@ -199,6 +199,11 @@ class KTCollectionsBookList(CollectionsBookList):
|
|||||||
('series' in collection_attributes and
|
('series' in collection_attributes and
|
||||||
book.get('series', None) == category):
|
book.get('series', None) == category):
|
||||||
is_series = True
|
is_series = True
|
||||||
|
|
||||||
|
# The category should not be None, but, it has happened.
|
||||||
|
if not category:
|
||||||
|
continue
|
||||||
|
|
||||||
cat_name = category.strip(' ,')
|
cat_name = category.strip(' ,')
|
||||||
|
|
||||||
if cat_name not in collections:
|
if cat_name not in collections:
|
||||||
|
@ -37,7 +37,7 @@ class KOBO(USBMS):
|
|||||||
|
|
||||||
dbversion = 0
|
dbversion = 0
|
||||||
fwversion = 0
|
fwversion = 0
|
||||||
supported_dbversion = 65
|
supported_dbversion = 75
|
||||||
has_kepubs = False
|
has_kepubs = False
|
||||||
|
|
||||||
supported_platforms = ['windows', 'osx', 'linux']
|
supported_platforms = ['windows', 'osx', 'linux']
|
||||||
@ -1537,7 +1537,11 @@ class KOBOTOUCH(KOBO):
|
|||||||
return bookshelves
|
return bookshelves
|
||||||
|
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
query = "select ShelfName from ShelfContent where ContentId = ? and _IsDeleted = 'false'"
|
query = "select ShelfName " \
|
||||||
|
"from ShelfContent " \
|
||||||
|
"where ContentId = ? " \
|
||||||
|
"and _IsDeleted = 'false' " \
|
||||||
|
"and ShelfName is not null" # This should never be nulll, but it is protection against an error cause by a sync to the Kobo server
|
||||||
values = (ContentID, )
|
values = (ContentID, )
|
||||||
cursor.execute(query, values)
|
cursor.execute(query, values)
|
||||||
for i, row in enumerate(cursor):
|
for i, row in enumerate(cursor):
|
||||||
|
@ -13,6 +13,7 @@ from itertools import izip
|
|||||||
|
|
||||||
from calibre import prints
|
from calibre import prints
|
||||||
from calibre.constants import iswindows, numeric_version
|
from calibre.constants import iswindows, numeric_version
|
||||||
|
from calibre.devices.errors import PathError
|
||||||
from calibre.devices.mtp.base import debug
|
from calibre.devices.mtp.base import debug
|
||||||
from calibre.devices.mtp.defaults import DeviceDefaults
|
from calibre.devices.mtp.defaults import DeviceDefaults
|
||||||
from calibre.ptempfile import SpooledTemporaryFile, PersistentTemporaryDirectory
|
from calibre.ptempfile import SpooledTemporaryFile, PersistentTemporaryDirectory
|
||||||
@ -23,6 +24,12 @@ from calibre.utils.filenames import shorten_components_to
|
|||||||
BASE = importlib.import_module('calibre.devices.mtp.%s.driver'%(
|
BASE = importlib.import_module('calibre.devices.mtp.%s.driver'%(
|
||||||
'windows' if iswindows else 'unix')).MTP_DEVICE
|
'windows' if iswindows else 'unix')).MTP_DEVICE
|
||||||
|
|
||||||
|
class MTPInvalidSendPathError(PathError):
|
||||||
|
|
||||||
|
def __init__(self, folder):
|
||||||
|
PathError.__init__(self, 'Trying to send to ignored folder: %s'%folder)
|
||||||
|
self.folder = folder
|
||||||
|
|
||||||
class MTP_DEVICE(BASE):
|
class MTP_DEVICE(BASE):
|
||||||
|
|
||||||
METADATA_CACHE = 'metadata.calibre'
|
METADATA_CACHE = 'metadata.calibre'
|
||||||
@ -46,6 +53,7 @@ class MTP_DEVICE(BASE):
|
|||||||
self._prefs = None
|
self._prefs = None
|
||||||
self.device_defaults = DeviceDefaults()
|
self.device_defaults = DeviceDefaults()
|
||||||
self.current_device_defaults = {}
|
self.current_device_defaults = {}
|
||||||
|
self.highlight_ignored_folders = False
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def prefs(self):
|
def prefs(self):
|
||||||
@ -59,9 +67,25 @@ class MTP_DEVICE(BASE):
|
|||||||
p.defaults['blacklist'] = []
|
p.defaults['blacklist'] = []
|
||||||
p.defaults['history'] = {}
|
p.defaults['history'] = {}
|
||||||
p.defaults['rules'] = []
|
p.defaults['rules'] = []
|
||||||
|
p.defaults['ignored_folders'] = {}
|
||||||
|
|
||||||
return self._prefs
|
return self._prefs
|
||||||
|
|
||||||
|
def is_folder_ignored(self, storage_or_storage_id, name,
|
||||||
|
ignored_folders=None):
|
||||||
|
storage_id = unicode(getattr(storage_or_storage_id, 'object_id',
|
||||||
|
storage_or_storage_id))
|
||||||
|
name = icu_lower(name)
|
||||||
|
if ignored_folders is None:
|
||||||
|
ignored_folders = self.get_pref('ignored_folders')
|
||||||
|
if storage_id in ignored_folders:
|
||||||
|
return name in {icu_lower(x) for x in ignored_folders[storage_id]}
|
||||||
|
|
||||||
|
return name in {
|
||||||
|
'alarms', 'android', 'dcim', 'movies', 'music', 'notifications',
|
||||||
|
'pictures', 'ringtones', 'samsung', 'sony', 'htc', 'bluetooth',
|
||||||
|
'games', 'lost.dir', 'video', 'whatsapp', 'image'}
|
||||||
|
|
||||||
def configure_for_kindle_app(self):
|
def configure_for_kindle_app(self):
|
||||||
proxy = self.prefs
|
proxy = self.prefs
|
||||||
with proxy:
|
with proxy:
|
||||||
@ -371,6 +395,8 @@ class MTP_DEVICE(BASE):
|
|||||||
|
|
||||||
for infile, fname, mi in izip(files, names, metadata):
|
for infile, fname, mi in izip(files, names, metadata):
|
||||||
path = self.create_upload_path(prefix, mi, fname, routing)
|
path = self.create_upload_path(prefix, mi, fname, routing)
|
||||||
|
if path and self.is_folder_ignored(storage, path[0]):
|
||||||
|
raise MTPInvalidSendPathError(path[0])
|
||||||
parent = self.ensure_parent(storage, path)
|
parent = self.ensure_parent(storage, path)
|
||||||
if hasattr(infile, 'read'):
|
if hasattr(infile, 'read'):
|
||||||
pos = infile.tell()
|
pos = infile.tell()
|
||||||
@ -472,7 +498,7 @@ class MTP_DEVICE(BASE):
|
|||||||
|
|
||||||
def config_widget(self):
|
def config_widget(self):
|
||||||
from calibre.gui2.device_drivers.mtp_config import MTPConfig
|
from calibre.gui2.device_drivers.mtp_config import MTPConfig
|
||||||
return MTPConfig(self)
|
return MTPConfig(self, highlight_ignored_folders=self.highlight_ignored_folders)
|
||||||
|
|
||||||
def save_settings(self, cw):
|
def save_settings(self, cw):
|
||||||
cw.commit()
|
cw.commit()
|
||||||
|
@ -239,12 +239,12 @@ class TestDeviceInteraction(unittest.TestCase):
 
 # Test get_filesystem
 used_by_one = self.measure_memory_usage(1,
-self.dev.dev.get_filesystem, self.storage.object_id, lambda x:
-x)
+self.dev.dev.get_filesystem, self.storage.object_id,
+lambda x, l:True)
 
 used_by_many = self.measure_memory_usage(5,
-self.dev.dev.get_filesystem, self.storage.object_id, lambda x:
-x)
+self.dev.dev.get_filesystem, self.storage.object_id,
+lambda x, l: True)
 
 self.check_memory(used_by_one, used_by_many,
 'Memory consumption during get_filesystem')