diff --git a/.bzrignore b/.bzrignore index f14ff947f6..6b6450f1f9 100644 --- a/.bzrignore +++ b/.bzrignore @@ -37,7 +37,9 @@ nbproject/ calibre_plugins/ recipes/.git recipes/.gitignore -recipes/README +recipes/README.md +recipes/icon_checker.py +recipes/readme_updater.py recipes/katalog_egazeciarz.recipe recipes/tv_axnscifi.recipe recipes/tv_comedycentral.recipe @@ -60,6 +62,7 @@ recipes/tv_tvpkultura.recipe recipes/tv_tvppolonia.recipe recipes/tv_tvpuls.recipe recipes/tv_viasathistory.recipe +recipes/icons/katalog_egazeciarz.png recipes/icons/tv_axnscifi.png recipes/icons/tv_comedycentral.png recipes/icons/tv_discoveryscience.png diff --git a/Changelog.old.yaml b/Changelog.old.yaml index 48382ab452..4910f4f849 100644 --- a/Changelog.old.yaml +++ b/Changelog.old.yaml @@ -1,3 +1,1682 @@ + +- version: 0.8.53 + date: 2012-05-25 + + new features: + - title: "Kindle Touch/4 driver: Upload cover thumbnails when sending books to device by USB to workaround Amazon bug of not displaying covers for sync-enabled books" + + - title: "Support for updating metadata in FB2 files" + + - title: "Set a different background color when choosing formats to not delete as opposed to choosing format to delete." + tickets: [ 1001741 ] + + - title: "E-book viewer: Add an option to prevent the up and down arrow keys from scrolling past page breaks" + + - title: "Get Books: Remove ebookshoppe.com at the website's request" + + bug fixes: + - title: "PDF Input: Support image rotation commands in PDF files. Fixes the long standing problem of some images being flipped when converting from PDF in calibre." + + - title: "Fix a regression in 0.8.51 that caused conversion to HTMLZ to not have any CSS" + + - title: "Get Books: Fix website change at kobo.com causing prices not to be found" + + - title: "Edit the time in the 24 hour clock when calibre's interface language is set to German." 
+ tickets: [ 1001809 ] + + - title: "MOBI Output: When generating joint KF8/MOBI6 .mobi files set the text length field in the MOBI 6 header correctly. " + tickets: [ 1003489 ] + + - title: "ODT Input: More workarounds for LibreOffice 3.5's habit of inserting pointless margin:100% directives everywhere." + tickets: [ 1002702 ] + + - title: "Fix regression that broke smarten punctuation when quotes were next to html tags." + tickets: [ 998900 ] + + - title: "Fix published date from ozon.ru wrong in some timezones" + tickets: [ 975338 ] + + - title: "Catalogs: Handle the use of custom columns with non-ascii names correctly" + tickets: [1001437] + + - title: "Conversion pipeline: Remove the attempt to detect and autocorrect if text will go off the left edge of the page, as it was a rather crude heuristic. Also do not remove fake margins if the book uses negative text indents on the margined elements." + + - title: "KF8 Output: Set offsets to tags in the skeleton the same way kindlegen does. Also linearize non linear ToCs to ensure section to section jumping works." + + - title: "Conversion pipeline: Use correct default value of 'inherit' for font-family and font-size when normalizing the shorthand font property." + + - title: "When running python scripts via calibre-debug ensure that user plugins are loaded" + + improved recipes: + - Business Week Magazine + - Metro Nieuws NL + + new recipes: + - title: Attac.es + author: Marc Busque + + - title: Drytooling.com + author: Damian Granowski + + - title: Shortlist.com + author: Dave ASbury + + - title: National Geographic (es) + author: vakya + +- version: 0.8.52 + date: 2012-05-18 + + new features: + - title: "EPUB Input: When setting the cover for a book that identifies its cover image, but not the html wrapper around the cover, try to detect and remove that wrapper automatically." 
+ tickets: [ 999959 ] + + - title: "When deleting books of a specific format, show the number of books with each format available" + + - title: "Linux install: No longer create MAN pages as all utilities have more comprehensive command line --help anyway" + + - title: "Add a tweak Preferences->Tweaks to control the default choice of format for the Tweak Book feature" + + - title: "Conversion: Allow setting negative page margins. A negative page margin means that calibre will not specify any page margin in the output document (for formats that support this)" + + bug fixes: + - title: "Tweak book: Fix handling of covers when tweaking KF8 books" + + - title: "KF8 Output: Handle input documents with out of sequence ToC entries. Note that currently section jumping in the KF8 output produced by calibre for such files does not work." + tickets: [1000493] + + - title: "Edit metadata dialog: Fix the edit values button for custom tag-like columns showing a unneeded warning about changed values" + + - title: "EPUB Output: Be a little more conservative when removing
tags. Only remove them if they have actual forms inside. " + tickets: [ 1000384 ] + + - title: "EPUB Input: Correctly update the Cover entry in the ToC even when the entry has a fragment reference. " + tickets: [ 999973 ] + + - title: "Update ImageMagick DLLs in all calibre binary builds to fix security vulnerabilities in ImageMagick" + tickets: [ 999496 ] + + - title: "Advanced search dialog: Fix equals and regex matching not being applied for custom column searches." + tickets: [ 980221 ] + + - title: "RTF Input: Handle old RTF files that have commands without braces." + tickets: [ 994133 ] + + - title: "Get Books: Diesel, fix results not showing when only a single match is found" + + - title: "Get Books: Fix DRM status indicators for Kobo and Diesel stores. Fix smashwords not returning results." + tickets: [ 993755 ] + + - title: "Fix regression in 0.8.51 that broke viewing of LIT and some EPUB files" + tickets: [998248, 998216] + + improved recipes: + - Clarin + - Spiegel + - Spiegel International + - Montreal Gazette + - Gosc Niedzelny + - Ars Technica + + new recipes: + - title: "Army/Navy/Air force/Marine Times and News busters" + author: jde + + - title: "Ads of the World, Heavy Meta (Italian) and Juve La Stampa" + author: faber1971 + + - title: "Revista Summa" + author: Vakya + + - title: "Strategic culture" + author: Darko Miletic + + - title: Stars and Stripes + author: adoucette + + - title: Nackdenkseiten + author: jrda + + +- version: 0.8.51 + date: 2012-05-11 + + new features: + - title: "When switching libraries preserve the position and selected books if you switch back to a previously opened library." 
+ tickets: [994514] + + - title: "Conversion pipeline: Filter out the useless font-face rules inserted by Microsoft Word for every font on the system" + + - title: "Driver for Motorola XT875 and Pandigital SuperNova" + tickets: [996890] + + - title: "Add a colour swatch to the dialog for creating column coloring rules, to ease selection of colors" + tickets: [994811] + + - title: "EPUB Output: Consolidate internal CSS generated by calibre into external stylesheets for ease of editing the EPUB" + + - title: "List EPUB and MOBI at the top of the dropdown list of formats to convert to, as they are the most common choices" + tickets: [994838] + + bug fixes: + - title: "E-book viewer: Improve performance when switching between normal and fullscreen views." + tickets: [996102] + + - title: "Edit metadata dialog: When running download metadata do not insert duplicate tags into the list of tags" + + - title: "KF8 Input: Do not error out if the file has a few invalidly encoded bytes." + tickets: [997034] + + - title: "Fix download of news in AZW3 format not working" + tickets: [996439] + + - title: "Pocketbook driver: Update for new PB 611 firmware." + tickets: [903079] + + - title: "ebook-convert: Error out if the user provides extra command line args instead of silently ignoring them" + tickets: [994939] + + - title: "EPUB Output: Do not self close any container tags to prevent artifacts when EPUBs are viewed using buggy browser based viewers." 
+ tickets: [994861] + + - title: "Fix regression in 0.8.50 that broke the conversion of HTML files that contained non-ascii font-face declarations, typically produced by Microsoft Word" + + improved recipes: + - Mainichi news + - derStandard + - Endgadget Japan + + new recipes: + - title: Mainichi English + author: Hiroshi Miura + + - title: The Grid TO + author: Yusuf W + + - title: National Geographic (Italy) + author: faber1971 + + - title: Rebelion + author: Marc Busque + +- version: 0.8.50 + date: 2012-05-04 + + new features: + - title: "Tweak Book: Allow tweaking of KF8 MOBI files. Useful to fine-tune the result of a conversion. Right click on the book and select Tweak Book to use the feature. Note that tweaking a MOBI file that contains both KF8 and older MOBI6 will cause the MOBI6 version to be discarded." + + - title: "AZW3 output plugin. This output plugin generates pure KF8 mobi files. These only work on the Kindle Fire and Kindle Touch with latest firmware." + + - title: "Conversion: Allow easy re-ordering of the search and replace expressions in the conversion dialog. Also apply the expressions in the same order that they were entered when doing the conversion." + + - title: "Automatically add the Tag 'Sample Book' when an Amazon sample is added to calibre" + + - title: "FB2 Input: Better handling of inline images." + tickets: [989869] + + bug fixes: + - title: "KF8 Output: Fix section to section jumps not working for documents with multi-level ToCs" + + - title: "EPUB Input: Handle the case of the metadata ToC containing a reference to the cover HTML file." + tickets: [993812] + + - title: "CHM Input: Handle files with deeply nested markup and non html files listed at the start of the manifest." 
+ tickets: [993607] + + - title: "KF8 Output: Workaround Kindle Touch bug that causes the book to be rendered as black pages when a height is specified for " + + - title: "Fix regression in 0.8.49 that broke italics detection in heuristic processing on 32-bit systems." + tickets: [991380] + + - title: "KF8 Output: Fix joint MOBI6/KF8 books not being recognized as MOBI files by older Kindles" + + - title: "KF8 Output: Fix errors when processing documents with HTML comments and/or XML processing instructions" + + - title: "Get Books: Amazon fix prices not being found. B&N fix details link. ebooks.com: fix cover image. Website changes to various EU stores" + + - title: "FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded." + tickets: [990929] + + - title: "Fix scrolling with the cover browser updating only the selection in the book list, not the current book." + tickets: [990881] + + - title: "Save to Disk: Do not run out memory when saving very large files on systems with low RAM." + tickets: [990741] + + - title: "FB2 Output: Use 2 letter language codes in preference to 3-letter ones to not break poorly implemented FB2 readers" + tickets: [990026] + + - title: "EPUB Input: Auto set the media-type for OPF manifest entries with an empty media-type" + + improved recipes: + - National Post + - Daily Mirror + - Sun + - Newsweek Polska + - Max-Planck + - derStandard + - tweakers.net + + new recipes: + - title: George Monbiot + author: Darko Miletic + + - title: El Mundo + author: atordo + + - title: AraInfo and Diagonal + author: Ruben Pollan + + +- version: 0.8.49 + date: 2012-04-27 + + new features: + - title: "Experimental support for generating Amazon's new KF8 format MOBI files" + description: "calibre can now generate Amazon's new KF8 format MOBI files. + To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. 
In the box add: + test_mobi_output_type = 'both' + calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them. + To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511 + Note that calibre support for KF8 is still experimental and there will likely be bugs." + + - title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness." + + - title: "Show cover size in a tooltip in the conversion dialog" + tickets: [986958] + + - title: "Driver for Nook Simple Touch with Glow Light" + tickets: [989264] + + bug fixes: + - title: "Heuristics: When italicizing words do not operate on words not in between HTML tags." + tickets: [986298] + + - title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates." + tickets: [986658] + + - title: "Fix tooltip not being updated in the book details panel when pasting in a new cover" + tickets: [986958] + + - title: "Cover Browser: Wrap the title on space only, not in between words." + tickets: [986516] + + - title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book." + tickets: [986903] + + - title: "Fix heuristics not removing unnecessary hyphens from the end of lines." + tickets: [822744] + + improved recipes: + - Metro Nieuws NL + - Der Tagesspiegel + + new recipes: + - title: Berria + author: Alayn Gortazar + + - title: Sol Haber + author: Onur Gungor + + - title: Telam + author: Darko Miletic + + - title: Richmond Times-Dispatch + author: jde + +- version: 0.8.48 + date: 2012-04-20 + + new features: + - title: "Conversion: The search and replace feature has been completely revamped." + description: "You can now use any number of search and replace + expression, not just three. You can also store and load frequently used + sets of search and replace expressions. 
Also, the wizard generates its + preview in a separate process to protect against crashes/memory leaks." + tickets: [983476,983484,983478] + + - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free." + + - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X" + tickets: [981185] + + bug fixes: + - title: "Get Books: Support the new website design of Barnes & Noble" + + - title: "T1 driver: Fix books sent to SD card sometimes resulting problems when deleted." + tickets: [943586] + + - title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'" + + - title: "MOBI Output: Handle background color specified on and in addition to tags." + tickets: [980813] + + - title: "MOBI Output: Fix underline style applied to parent element not getting inherited by children." + tickets: [985711] + + improved recipes: + - xkcd + - Metro Nieuws + - Calgary Herald + - Orlando Sentinel + - countryfile + - Heise + + new recipes: + - title: Various new Polish news sources + author: fenuks + + - title: Various Italian news sources + author: faber1971 + + - title: Jakarta Globe + author: rty + + - title: Acim Bilim Dergisi + author: thomass + +- version: 0.8.47 + date: 2012-04-13 + + new features: + - title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec." 
+ tickets: [976056] + + - title: "Support for viewing and converting the Haodoo PDB ebook format" + tickets: [976478] + + - title: "Device driver for Laser EB720" + + bug fixes: + - title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled" + tickets: [976336] + + - title: 'Fix "Tags" field in advanced search does not obey regex setting' + tickets: [980221] + + - title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single tag, instead of rendering the page" + + - title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device" + + - title: "Amazon metadata download: Handle books whose titles start with a bracket." + tickets: [976365] + + - title: "Get Books: Fix downloading of purchased books from Baen" + tickets: [975929] + + + improved recipes: + - Forbes + - Caros Amigos + - Trouw + - Sun UK + - Metro + - Daily Mirror + + new recipes: + - title: "Melbourne Herald Sun" + author: Ray Hartley + + - title: "Editoriali and Zerocalcare" + author: faber1971 + +- version: 0.8.46 + date: 2012-04-06 + + new features: + - title: "Auto adding: When automatically adding files from a folder, automatically convert the files to the current output format after adding. This can be turned off via Preferences->Adding Books->Automatic Adding." + tickets: [969053] + + - title: "E-book viewer: When reading a MOBI file that is actually a KF8 book, show the format as being KF8" + + - title: "Content server: Workaround for android stock browser not support HTTP AUTH." + + - title: "Edit metadata dialog: Change the remove unused series button to a clear series button (as the remove unused series function is now automatic)" + + - title: "Driver for PocketBook 622." 
+ tickets: [969875] + + bug fixes: + - title: "Run metadata downloads in a separate process to workaround memory leaks in third party plugins. Also removes the need to break up bulk metadata downloads into 100 book batches." + + - title: "Make tag browser filtering work when capital letters are entered." + + - title: "EPUB metadata: Ignore urn:isbn: prefix from ISBN declaration when reading metadata" + + - title: "Get books: Fix feedbooks store not showing all available formats" + + - title: "KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead." + tickets: [969238] + + - title: "Fix regression that broke access to Preferences via the Preferences item in the calibre menu on OS X" + tickets: [969418] + + - title: "Fix bug that ignored metadata specified on the command line when using calibredb add" + + improved recipes: + - OReilly Premium + - Real Clear + - Soldier's Magazine + - Rue89 + + new recipes: + - title: The Southern Star + author: watou + + - title: Buenos Aires Herald + author: Darko Miletic + +- version: 0.8.45 + date: 2012-03-30 + + new features: + - title: "E-book viewer: Allow the up and down keys to scroll past section boundaries" + + - title: "calibredb: Allow specification of basic metadata on the command line when adding books." + tickets: [951063] + + - title: "Driver for Samsung Galaxy Plus GT-I9001" + + - title: "KF8 Input: Support KF8 format Amazon book samples." + tickets: [963418] + + - title: "When a new plugin is added to calibre for the first time, have its icon (if any) show up even when a device is connected (this can be changed by the user at the time of plugin installation)" + + - title: "Add keyboard shortcuts for Bold, Italic and Underline to the comments editor in the edit metadata dialog" + tickets: [963559] + + bug fixes: + - title: "E-book viewer: Fix last read position (and bookmarks in general) being inaccurate for some books." 
+ description: "The technique for marking locations in books used by the viewer has changed. The new technique should be much more accurate than the last one, especially when the font size at which the book is being viewed is changed. Note that this change means that bookmarks created with this release of calibre will not be read by previous calibre versions. On a technical note, the viewer now uses the CFI specification from the EPUB 3 standard for bookmarks." + type: major + + - title: "Workarounds for a few regressions in the user interface in 0.8.44 caused by the update to Qt 4.8.0" + + - title: "Books list: Preserve the horizontal scroll position when sorting by a column" + + - title: "Fix saving to disk and then adding the book back not restoring tags-like custom columns" + + - title: "Linux installer: Fix completion for ebook-convert not working." + tickets: [967834] + + - title: "MOBI Output: Recognize type=text in addition to type=start guide elements" + + - title: "Get Books: Updates to Nexto, Ebookpoint and Woblink stores" + + - title: "Fix unable to clear username/password in Fetch news dialog" + + - title: "PDF Output: Fix margin specifications not being applied" + + - title: "Linux installer: Manually preserve the defaults.list mimetype association file to workaround buggy xdg-desktop-menu implementations in some distros." + tickets: [926559] + + - title: "E-book viewer: Fix regression that caused the ebook viewer to stop functioning if it is launched from the main calibre program and then the main calibre program is closed." + tickets: [963960] + + + improved recipes: + - Our Daily Bread + + new recipes: + - title: NRC Handelsblad (free) + author: veezh + +- version: 0.8.44 + date: 2012-03-23 + + new features: + - title: "E-book viewer: A whole new full screen mode." + description: "The new mode has no toolbars to distract from the text and the ability to set the width of the column of text via Preferences in the ebook viewer. 
Click the Fullscreen button on the toolbar in the viewer to enter fullscreen mode (or press the F11 or Ctrl+Shift+F keys)" + type: major + tickets: [959830] + + - title: "Copy to Library: If books were auto merged by the copy to library process, popup a message telling the user about it, as otherwise some people forget they have turned on auto merge and accuse calibre of losing their books." + + - title: "Unix driver for Ectaco JetBook color" + tickets: [958442] + + - title: "Add a link to the 'Adding Books Preferences' in the drop down menu of the Add Books button for easier access and more prominence" + tickets: [958145] + + - title: "Smarten punctuation: Add a few more cases for detecting opening and closing quotes" + + bug fixes: + - title: "Get Books: Updates to various store plugins to deal with website changes: Amazon Europe, Waterstones, Foyles, B&N, Kobo, Woblink and Empik" + + - title: "Catalog generation: Do not error out when generating csv/xml catalogs if the catalog title contains filename invalid characters." + tickets: [960154] + + - title: "RTF Output: Ignore corrupted images in the input document, instead of erroring out." + tickets: [959600] + + - title: "E-book viewer: Try to preserve page position when the window is resized" + + - title: "Fix bug that caused wrong series to be shown when clicking on the first letter of a series group in the Tag Browser" + + - title: "Fix calibre not supporting different http and https proxies." + tickets: [960173] + + - title: "MOBI Input: Fix regression caused by KF8 support that broke reading of ancient non-Amazon PRC files" + + - title: "Fix EPUB to EPUB conversion of an EPUB with obfuscated fonts resulting in the fonts not being readable in Adobe Digital Editions" + tickets: [957527] + + - title: "RTF Output: Fix bug that broke conversion to RTF when the input document contains tags with no src attribute." 
+ + - title: "Fix regression in 0.8.43 that broke use of general mode templates that ended in a semi-colon." + tickets: [957295] + + improved recipes: + - b92 + - Various Polish news sources + - Le Monde + - FHM UK + + new recipes: + - title: Ivana Milakovic and Klub knjige + author: Darko Miletic + + +- version: 0.8.43 + date: 2012-03-16 + + new features: + - title: "Template language: Speedup evaluation of general program mode templates by pre-compiling them to python. If you experience errors with this optimization, you can turn it off via Preferences->Tweaks. Also other miscellaneous optimizations in evaluating templates with composite columns." + + - title: "MOBI Output: Add an option to not convert all images to JPEG when creating MOBI files. For maximum compatibility of the produced MOBI files, do not use this option." + tickets: [954025] + + - title: "Add iPad3 Output Profile" + + bug fixes: + - title: "KF8 Input: Add support for KF8 files with obfuscated embedded fonts" + tickets: [953260] + + - title: "Make the stars in the book list a little larger on windows >= vista" + + - title: "Revised periodical Section layout, for touchscreen devices resolving iBooks problem with tables spanning multiple pages" + + - title: "Read dc:contributor metadata from MOBI files" + + - title: "MOBI Output: Fix a regression that caused the generated thumbnail embedded in calibre produced MOBI files to be a large, low quality image instead of a small, high quality image. You would have been affected by this bug only if you directly used the output from calibre, without exporting it via send to device or save to disk." + tickets: [954254] + + - title: "KF8 Input: Recognize OpenType embedded fonts as well." + tickets: [954728] + + - title: "Fix regression in 0.8.41 that caused file:/// URLs to stop working in the news download system on windows." 
+ tickets: [955581] + + - title: "When setting metadata in MOBI files fix cover not being updated if the mobi file has its first image record as the cover" + + - title: "Fix column coloring rules based on the size column not working" + tickets: [953737] + + improved recipes: + - Microwaves and RF + - idg.se + + new recipes: + - title: SatMagazine + author: kiavash + +- version: 0.8.42 + date: 2012-03-12 + + new features: + - title: "Support for reading Amazon's new KF8 format" + type: major + description: "calibre can now both view and convert MOBI files that contain Amazon's new KF8 (Kindle Fire) format" + + - title: "Add a tweak to Preferences->Tweaks to control the font size used in the book details panel" + tickets: [948357] + + - title: "Allow specifying a list of file types to exclude when automatically adding files from a folder" + tickets: [943025] + + - title: "Show ratings in the book details panel as stars. Also allow the user to change the alignment of the ratings column in the main books list. No longer display the stars in blue, instead their color can be customized via the column coloring rules, like any other column" + + - title: "When setting metadata in EPUB ensure that the tag has its name attribute first. Needed for the Nook." + + - title: "Drivers for Novo 7, LG G2x and Zenithink T-280" + tickets: [941671, 940625, 940527] + + - title: "Update linux binaries to Qt 4.8.0" + + bug fixes: + - title: "Fix some rar files causing crashes on OS X (updated libunrar.dylib in the OS X build)" + tickets: [951185] + + - title: "MOBI Output: Ignore the Table of Contents pointed to by the guide, if it contains no links" + + - title: "ODT Input: Ignore margin declaration in ODT styles if more specific margin-* declarations are present" + tickets: [941134] + + - title: "Conversion pipeline: Fix @import rules in CSS stylesheets that have comments on their first few lines being ignored." 
+ + - title: "EPUB Input: When extracting the contents of epub files on windows, do not error out if one or more of the components in the epub file have filepaths containing characters that are invalid for the windows filesystem, instead, just replace those characters, since those entries are likely to be errors in the zip container anyway." + tickets: [950081] + + - title: "Textile output: Fix issue with blockquotes and sentences getting removed." + + - title: "MOBI Output: When using the prefer author sort conversion option, handle multiple authors better." + tickets: [947146] + + - title: "Fix regression in 0.8.41 that broke direct connection to iDevices in windows" + tickets: [944534] + + - title: "Fix the download bulk metadata completed popup causing a crash if the Esc key is pressed." + tickets: [943056] + + - title: "Fix rating values doubled in CSV/XML catalogs" + tickets: [942790] + + - title: "EPUB Input: Remove non markup documents from the spine automatically, instead of erroring out" + + - title: "When formatting ratings in templates, etc., do not have an unnecessary .0" + + - title: "Calibre portable: Do not allow calibre portable to run if it is placed in a location whose path is too long. Also hide the library location setup in the welcome wizard when running the portable build." + + - title: "Fix regression in 0.8.41 that broke calibre if the TMP or TEMP environment variable is set to the root of a drive." 
+ tickets: [952284] + + - title: "Fix display of ratings type custom fields in the content server" + tickets: [940600] + + + improved recipes: + - La Jornada + - Chicago Tribune + - Mediapart + - rue89 + + new recipes: + - title: Racjonalista + author: Racjonlista + + - title: JAPAA + author: adoucette + + +- version: 0.8.41 + date: 2012-02-24 + + new features: + - title: "Driver for Sony Experia Play 4G" + tickets: [938831] + + - title: "News download system: Allow use of __future__ in recipes, and do not change line numbers of code in the recipe when compiling it" + + - title: "Use the My Documents folder as the default location for the Calibre Library folder on first start in windows" + tickets: [934840] + + - title: "Add a tweak to Preferences->Tweaks to control the order in which categories appear in the Tag Browser" + + - title: "Tag Browser: Add an entry to the right click menu to quickly delete tags" + tickets: [934509] + + - title: "Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it ends up using." + + - title: "Content server: Add favicon to OPDS feeds." + tickets: [934731] + + bug fixes: + - title: "RTF Input: Fix some WMF images embedded in RTF files being distorted on conversion." + tickets: [934167] + + - title: "Fix long standing bug preventing calibre from working on east asian windows installs when the user name in windows has non-ascii characters" + tickets: [937389] + + - title: "Get Books: Fix Baen Webscription and O'Reilly stores. Fix price detection for Google Books" + + - title: "MOBI Output: When the same anchor is present more than once in the input document, use the first occurrence rather than the last one." 
+ tickets: [934031] + + - title: "Use the 'default cover font' tweak when generating default masthead images as well" + tickets: [939256] + + - title: "Fix content server does not correctly display custom field of type 'rating'" + tickets: [938303] + + - title: "Fix welcome wizard does not save send-from email info unless send-to field is filled" + tickets: [937087] + + - title: "When reading metadata from odt files, use initial-creator in preference to creator for setting the author field" + tickets: [934564] + + - title: "Fix conversion erroring out when the input document has very long and thin images" + tickets: [935234] + + improved recipes: + - The Sun + - Various Polish news sources + - Mediapart + + new recipes: + - title: La pausa caffe + author: faber1971 + + - title: Various Polish news sources + author: fenuks + + +- version: 0.8.40 + date: 2012-02-17 + + new features: + - title: "Amazon metadata download: Support the new 'Book Description' section that Amazon publishes for some books. Also workaround the amazon US servers occasionally returning broken markup leading to calibre not finding any matches for books on Amazon." + + - title: "Kindle driver: Add an option to allow using page counts stored in a custom column. Go to Preferences->Plugins and customize the Kindle driver, to tell it to use a custom column to get page count data. See http://www.mobileread.com/forums/showpost.php?p=1963075&postcount=215 for details." + + - title: "Template language: Add a current_library_name() function that can be used to return the name of the currently opened library in calibre" + + - title: "Driver for Xperia Neo and PocketBook A10" + tickets: [930788] + + bug fixes: + - title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh format information in standalone calibre-server processes" + + - title: "Fix regression in 0.8.39 that broke getting covers from some epub files on OS X." 
+ tickets: [932507] + + - title: "Reading metadata from HTML files: Do not take a very long time for very large HTML files. Also fix reading metadata from meta tags with multiple spaces before the content attribute." + tickets: [932262] + + - title: "EPUB Output: Fix splitting breaking internal links in the epub, if the links pointed to files with URL unsafe characters in their file names." + tickets: [929966] + + - title: "Fix auto adding not leaving languages field blank when book has no defined language" + tickets: [930648] + + improved recipes: + - Samanyolu Haber + - Kurier + - Le devoir + - Daily Mirror + - Common Dreams + - Pescanik + + new recipes: + - title: Asian Review of Books + author: Darko Miletic + + - title: Albert Mohler, Desiring God, Living Stones and Resurgence + author: Peter Grungi + + - title: Novinite BG + author: M3 Web + + - title: Catholic Daily Readings + author: adoucette + + - title: Consortium News and Microwave and RF magazine + author: kiavash + +- version: 0.8.39 + date: 2012-02-10 + + new features: + - title: "Auto-adding: Add an option to check for duplicates when auto adding." + tickets: [926962] + + - title: "Content server: Export a second record via mDNS that points to the full OPDS feed in addition to the one pointing to the Stanza feed. The new record is of type _calibre._tcp." + tickets: [929304] + + - title: "Allow specifying a set of categories that are not partitioned even if they contain a large number of items in the Tag Browser. Preference is available under Look & Feel->Tag Browser" + + - title: "Allow setting a URL prefix for the content server that runs embedded in the calibre GUI as well." + tickets: [928905] + + - title: "Allow output of identifiers data in CSV/XML/BiBTeX catalogs" + tickets: [927737] + + - title: "Driver for Motorola Droid XT910, Nokia E71 and HTC EVO 3D." 
+ tickets: [928202, 927818, 929400] + + - title: "Cut down the time taken to launch worker processes by 40%" + + - title: "You can now configure the calibre settings for the currently connected device by right clicking on the device icon in the toolbar, instead of having to go through Preferences->Plugins" + + bug fixes: + - title: "Auto-adding: Do not add incomplete files when files are downloaded directly into the auto add folder." + tickets: [926578] + + - title: "When running multiple delete from device jobs, fix the device view sometimes marking the wrong books as being deleted, after the first delete job completes." + tickets: [927972] + + - title: "MOBI Input: Handle files that have spurious closing and/or tags in their markup." + tickets: [925833] + + - title: "RTF Input: Strip out false color specifications, as they cause artifacts when converted to MOBI" + + improved recipes: + - Updated Postmedia publications + - Foreign Affairs + - Read It Later + - Microwave Journal + - taggeschau.de + + new recipes: + - title: Vancouver Province and Windsor Star + author: Nick Redding + + - title: Onda Rock + author: faber1971 + + - title: Il Manifesto + author: Giacomo Lacava + +- version: 0.8.38 + date: 2012-02-03 + + new features: + - title: "Implement the ability to automatically add books to calibre from a specified folder." + type: major + description: "calibre can now watch a folder on your computer and instantly add any files you put there to the calibre library as new books. You can tell calibre which folder to watch via Preferences->Adding Books->Automatic Adding." + tickets: [920249] + + - title: "Conversion: When automatically inserting page breaks, do not put a page break before a
<p>
or
<div>
tag if it is immediately preceded by another
<p>
or
<div>
tag." + + - title: "Driver for EZReader T730 and Point-of-View PlayTab Pro" + tickets: [923283, 922969] + + bug fixes: + - title: "Fix device entry not visible in menubar even when it has been added via Preferences->Toolbars." + tickets: [923175] + + - title: "Fix metadata plugboards not applied when auto sending news by email" + + - title: "Fix regression in 0.8.34 that broke recipes that used skip_ad_pages() but not get_browser(). " + tickets: [923724] + + - title: "Restore device support on FreeBSD, by using HAL" + tickets: [924503] + + - title: "Get books: Show no more than 10 results from the Gandalf store" + + - title: "Content server: Fix metadata not being updated when sending for some MOBI files." + tickets: [923130] + + - title: "Heuristic processing: Fix the italicize common patterns algorithm breaking on some HTML markup." + tickets: [922317] + + - title: "When trying to find an ebook inside a zip file, do not fail if the zip file itself contains other zip files." + tickets: [925670] + + - title: "EPUB Input: Handle EPUBs with duplicate entries in the manifest." + tickets: [925831] + + - title: "MOBI Input: Handle files that have extra tags sprinkled through out their markup." + tickets: [925833] + + improved recipes: + - Metro Nieuws NL + - FHM UK + + new recipes: + - title: Strange Horizons + author: Jim DeVona + + - title: Telegraph India and Live Mint + author: Krittika Goyal + + - title: High Country News + author: Armin Geller + + - title: Countryfile + author: Dave Asbury + + - title: Liberation (subscription version) + author: Remi Vanicat + + - title: Various Italian news sources + author: faber1971 + + +- version: 0.8.37 + date: 2012-01-27 + + new features: + - title: "Allow calibre to be run simultaneously in two different user accounts on windows." 
+ tickets: [919856] + + - title: "Driver for Motorola Photon and Point of View PlayTab" + tickets: [920582, 919080] + + - title: "Add a checkbox to preferences->plugins to show only user installed plugins" + + - title: "Add a restart calibre button to the warning dialog that pops up after changing some preference that requires a restart" + + bug fixes: + - title: "Fix regression in 0.8.36 that caused the remove format from book function to only delete the entry from the database and not delete the actual file from the disk" + tickets: [921721] + + - title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh the format information in the GUI" + tickets: [919494] + + - title: "E-book viewer: Preserve the current position more accurately when changing font size/other preferences." + tickets: [912406] + + - title: "Conversion pipeline: Fix items in the that refer to files with URL unsafe filenames being ignored." + tickets: [920804] + + - title: "Fix calibre not running on linux systems that set LANG to an empty string" + + - title: "On first run of calibre, ensure the columns are sized appropriately" + + - title: "MOBI Output: Do not collapse whitespace when setting the comments metadata in newly created MOBI files" + + - title: "HTML Input: Fix handling of files with ä characters in their filenames." + tickets: [919931] + + - title: "Fix the sort on startup tweak ignoring more than three levels" + tickets: [919584] + + - title: "Edit metadata dialog: Fix a bug that broke adding of a file to the book that calibre did not previously know about in the books directory while simultaneously changing the author or title of the book." 
+ tickets: [922003] + + improved recipes: + - People's Daily + - Plus Info + - grantland.com + - Eret es irodalom + - Sueddeutsche.de + + new recipes: + - title: Mumbai Mirror + author: Krittika Goyal + + - title: Real Clear + author: TMcN + + - title: Gazeta Wyborcza + author: ravcio + + - title: The Daily News Egypt and al masry al youm + author: Omm Mishmishah + + - title: Klip.me + author: Ken Sun + + +- version: 0.8.36 + date: 2012-01-20 + + new features: + - title: "Decrease startup time for large libraries with at least one composite custom column by reading format info on demand" + + - title: "When automatically deleting news older than x days, from the calibre library, only delete the book if it both has the tag News and the author calibre. This prevents accidental deletion of books tagged with News by the user." + + - title: "Driver for Infibeam Pi 2" + + - title: "Add a Tag Editor for tags like custom columns to the edit metadata dialog" + + bug fixes: + - title: "E-book viewer: Fix regression in 0.8.35 that caused viewer to raise an error on books that did not define a language" + + - title: "Content server: Fix grouping for categories based on custom columns." + tickets: [919011] + + - title: "Edit metadata dialog: When setting the series from a format or via metadata download, ensure that the series index is not automatically changed, when closing the dialog." + tickets: [918751] + + - title: "When reading metadata from Topaz (azw1) files, handle non ascii metadata correctly." + tickets: [917419] + + - title: "CHM Input: Do not choke on CHM files with non ascii internal filenames on windows." 
+ tickets: [917696] + + - title: "Fix reading metadata from CHM files with non-ascii titles" + + - title: "Fix HTML 5 parser choking on comments" + + - title: "If calibre is started from a directory that does not exist, automatically use the home directory as the working directory, instead of crashing" + + - title: "Fix iriver story HD Wi-Fi device and external SD card swapped" + tickets: [916364] + + - title: "Content server: Fix ugly URLs for specific format download in the book details and permalink panels" + + - title: "When adding FB2 files do not set the date field from the metadata in the file" + + improved recipes: + - OReilly Premuim + - Variety + - Blic + - New Journal of Physics + - Der Tagesspiegel + + new recipes: + - title: Tweakers.net + author: Roedi06 + + - title: Village Voice + author: Barty + + - title: Edge.org Conversations + author: levien + + - title: Novi list - printed edition + author: Darko Miletic + +- version: 0.8.35 + date: 2012-01-13 + + new features: + - title: "Metadata plugboards: Allow creation of plugboards for email delivery." + tickets: [914012] + + - title: "Tweak EPUB: Also allow tweaking of HTMLZ files (when both EPUB and HTMLZ are present, EPUB is preferred, this can be changed via Preferences->Tweaks)." + + - title: "TXT Input: Support direct conversion of files with extensions .md, .markdown and .textile." + tickets: [912931] + + - title: "E-book viewer: Speed up the optional hyphenation algorithm by upgrading the hyphenator library calibre uses" + + - title: "Drivers for PocketBook 611, Motorola Razr Droid and Onyx Boox i62" + + bug fixes: + - title: "MOBI Output: When converting a paragraph that contains only a non-breaking space into a line break, ignore paragraphs with height less than 2pt." + tickets: [915150] + + - title: "MOBI Input: Handle MOBI files that specify anchor point exactly at pagebreaks. These are apparently produced by John Wiley and Sons." 
+ tickets: [914036] + + - title: "Fetch news dialog: The Download now button is no longer scrolled out of view on OS X for news sources that require credentials" + + - title: "Fix commas being removed from author names when generating filenames in the calibre library" + + - title: "ODT Input: Dont crash on empty links" + + - title: "ebook-convert: Allow use of leading ./ when specifying output file names." + tickets: [913954] + + - title: "Fix deleting of hierarchical searches broken in Tag Browser" + tickets: [912345] + + - title: "Metadata search and replace: Fix rendering error when choosing {template}" + tickets: [913154] + + - title: "Fix calibre not starting when stray .po files are present in the working directory" + tickets: [913054] + + - title: "Do not error out when getting metadata for authors if the author name has either ::: or :#: in it." + tickets: [912713] + + improved recipes: + - Pagina 12 + - USA Today + - LWN Weekly + - Seattle Times + - San Jose Mercury + - Grantland.com + + new recipes: + - title: Lega Nerd and Pambianco + author: faber1971 + + - title: Various Turkish news sources + author: asalet_r + + - title: Microwave Journal + author: Kiavash + + - title: OReilly Premium + author: TechnoCat + + - title: Hamilton Spectator and Tillsonburg/Norfolk County + author: Eric Coolman + + - title: Opinion Bolivia + author: Piet van Oostrum + + - title: ideal.es + author: Josemi Liebana + + - title: Novilist Portal + author: Darko Miletic + +- version: 0.8.34 + date: 2012-01-06 + + new features: + - title: "Apple driver: Set the year field in iTunes based on the published date in calibre." + tickets: [909050] + + - title: "EPUB Input: When converting a file that has entries in the manifest that do no exist, remove them, instead of aborting the conversion." + tickets: [910933] + + - title: "Kindle driver: Ensure page counts are correctly sent to the device when connecting to Kindle 4/Touch." 
+ tickets: [910279] + + - title: "Allow user to set the number of recently viewed books shown in the dropdown menu of the view button, via a tweak in Preferences->Tweaks." + tickets: [910292] + + bug fixes: + - title: "Fix regression in 0.8.33 that caused calibre to crash when starting the Content Server, if the port the content server is trying to listen on is blocked/busy." + tickets: [910512] + + - title: "MOBI Input: Fix regression that caused a mixup of images when the MOBI file header contains an incorrect first image index pointer." + tickets: [911243] + + - title: "Do not remove leading and trailing spaces from the replace fields in the Search and Replace conversion options" + tickets: [910523] + + - title: "Conversion pipeline: Fix regression in 0.8.31 that broke parsing of documents containing a self closing tag." + tickets: [910325] + + improved recipes: + - Kopalnia Wiedzy + - Alternet + - Tagesspiegel + - Philadelphia Inquirer + - Seattle Times + - La Razon + + new recipes: + - title: Various Italian news sources + author: faber1971 + + - title: money.pl + author: intromatyk + + - title: Diario Rio Negro + author: Darko Miletic. + + - title: FHM UK + author: Dave Asbury + +- version: 0.8.33 + date: 2011-12-30 + + new features: + - title: "LIT Input: Switch to non-recursive algorithm, to allow conversion of files with deeply nested markup." + tickets: [909535] + + - title: "Content server: Do not show the original_* formats in the mobile interface. Also upgrade to the latest CherryPy release." + + - title: "E-book viewer: Add option in viewer preferences to control how much the font size is changed when you click the make fonts bigger/smaller buttons." + tickets: [908980] + + - title: "E-book viewer: Allow changing font size via Ctrl+Mouse wheel" + tickets: [908975] + + - title: "Kobo driver: Hide previews and recommendations from the book list. 
You can customize the Kobo plugin if you would like to see them via Preferences->Plugins" + + bug fixes: + - title: "Copy to library: Fix title sort not being copied" + + - title: "PDF Output: Add custom size conversion option to the GUI (it was only present on the command line before)" + + - title: "Add missing --keep-ligatures option to the ebook-convert command line" + tickets: [909182] + + - title: "Fix rendering of non ascii characters in generated masthead images when downloading news for the Kindle" + + - title: "Linux binary: Disable qt-sp-api as it causes crashes/performance issues on various distros" + + - title: "E-book viewer: Ensure that reference mode highlighting is removed from the book when reference mode is closed." + tickets: [908982] + + - title: "Fix unable to load plugins from files on GNOME/XFCE desktops" + + - title: "Fix regression that broke customizing toolbars on non English calibre installs" + + - title: "Conversion pipeline: Disable HTML 5 parsing if it results in deeply nested trees." + tickets: [908818] + + - title: "Do not loose position in book list on device connection/sync, if a search is active." + tickets: [908553] + + - title: "Fix regression in 0.8.32 that broke deleting books if the path to the library contained non-ascii characters on linux" + tickets: [908068] + + improved recipes: + - Echo Online + - La Razon + + new recipes: + - title: NYTimes Global + author: Krittika Goyal + + - title: Grantland + author: Barty + +- version: 0.8.32 + date: 2011-12-23 + + new features: + - title: "Linux: When deleting books, send them to the recycle bin, instead of permanently deleting. This is the same behavior as on Windows and OS X." 
+ + - title: "Add a checkbox to allow users to disable the popup that asks if books should be auto-converted before sending to device" + + - title: "Drivers for Droid Razr, Samsung GT-I9003 and Bookeen Odyssey" + tickets: [906356, 906056, 905862] + + - title: "Allow passing multiple filenames as command line arguments to calibre, to add multiple books." + tickets: [907968] + + bug fixes: + - title: "MOBI Output: Fix regression in 0.8.30 that caused the use of hidden heading elements for the TOC to generate links in the wrong place." + tickets: [907156] + + - title: "EPUB Output: Ensure directories have the correct permissions bits set when unzipping an epub with unzip on Unix" + + - title: "Fix bottom most shortcuts in keyboard shortcuts for viewer not editable" + + - title: "EPUB Output: Fix handling of self closing <audio> tags." + tickets: [906521] + + - title: "MOBI Input: Map invalid <o:p> tags to <p> tags before parsing, to handle broken nesting." + tickets: [905715] + + - title: "Conversion pipeline: HTML5 parsing: Fix handling of XML namespaces. Fixes regression in 0.8.30 that caused some articles in some news downloads to appear blank when viewed in Adobe Digital Editions based readers" + + - title: "Get Books: Gandalf store, fix price and cover detection" + + - title: "EPUB Output: Fix the Flatten filenames option in EPUB Output causing duplicated manifest ids in rare cases." + tickets: [905692] + + - title: "When adding books via ISBN, show the user the list of invalid ISBNs that will be ignored, if any, before starting the add operation." + tickets: [905690] + + - title: "Fix unsmarten punctuation conversion option broken in 0.8.31." 
+ tickets: [905596] + + - title: "Fix broken evaluation of composite columns in save-to-disk" + + improved recipes: + - Cosmopolitan UK + - Hindustan Times + - HVG + - moneynews.com + - Ming Pao + - Glasgow Herald + - Times of India + - Focus Magazine + - Hacker News + - Independent + - Sueddeutsche + + new recipes: + - title: Prospect Magazine UK + author: Barty and duoloz + + - title: Elet es Irodalom and NOL + author: Bigpapa + + - title: Salonica Press News + author: SteliosGero + + - title: Echo Online + author: Armin Geller + + - title: Various Polish news sources + author: fenuks + + - title: Various Italian news sources + author: faber1971 + +- version: 0.8.31 + date: 2011-12-16 + + new features: + - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness." + tickets: [901466] + + - title: "Driver for PocketBook 611 and Lenovo IdeaPad" + + - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks." + tickets: [902731] + + - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, GUardian and Globe and Mail recipes" + tickets: [900130] + + - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word" + + - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK" + + bug fixes: + - title: "Fix comments not hidden in Book details panel when they are turned off via Preferences->Look & Feel->Book Details" + + - title: "E-book viewer: Do not popup an error message if the user tries to use the mouse wheel to scroll before a document is loaded." + tickets: [903449] + + - title: "Add docx to the list of ebook extensions." 
+ tickets: [903452] + + - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles." + + - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book." + tickets: [902506] + + - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification" + + - title: "Catalog generation: Include the series_index field for custom series columns as well" + + - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)" + + - title: "HTML Input: Ignore unparseable URLs instead of crashing on them." + tickets: [902372] + + + improved recipes: + - La Republica + - CND + - Berliner Zeitung + - Zaman Gazetesi + + new recipes: + - title: CND Weekly + author: Derek Liang + + - title: descopera.org + author: Marius Ignatescu + + - title: Rynek Zdrowia + author: spi630 + +- version: 0.8.30 + date: 2011-12-09 + + new features: + - title: "Get Books: Add amazon.es and amazon.it" + + - title: "Bulk convert dialog: Disable the Use saved conversion settings checkbox when none of the books being converted has saved conversion settings" + + - title: "ebook-viewer: And a command line switch to specify the position at which the file should be opened." + tickets: [899325] + + - title: "Distribute calibre source code compressed with xz instead of gzip for a 40% reduction in size" + + bug fixes: + - title: "Get Books: Fix ebooks.com and amazon.fr. Fix cover display in Diesel ebooks store." + + - title: "HTML Input: Fix regression that broke processing of a small fraction of HTML files encoded in a multi-byte character encoding." 
+ tickets: [899691] + + - title: "Greatly reduce the delay at the end of a bulk metadata edit operation that operates on a very large number (thousands) of books" + + - title: "Template language: Fix the subitems formatter function to split only when the period is surrounded by non-white space and not another period" + + - title: "Fix ampersands in titles not displaying in the Cover Browser" + + - title: "MOBI Output: Do not ignore an empty anchor at the end of a block element." + + - title: "MOBI Output: Handle links to inline anchors placed inside large blocks of text correctly, i.e. the link should not point to the start of the block." + tickets: [899831] + + - title: "E-book viewer: Fix searching for text that is represented as entities in the underlying HTML." + tickets: [899573] + + - title: "Have the Esc shortcut perform exactly the same set of actions as clicking the clear button." + tickets: [900048] + + - title: "Prevent the adding books dialog from becoming too wide" + + - title: "Fix custom column editing not behaving correctly with the Previous button in the edit metadata dialog." + tickets: [899836] + + - title: "T1 driver. More fixes to datetime handling to try to convince the T1's buggy firmware to not rescan metadata." + tickets: [899514] + + - title: "Only allow searching via non accented author names if the user interface language in calibre is set to English." + tickets: [899227] + + improved recipes: + - Die Zeit subscription + - Metro UK + - suedeutsche.de + + new recipes: + - title: Blues News + author: Oskar Kunicki + + - title: "TVXS" + author: Hargikas + + +- version: 0.8.29 + date: 2011-12-02 + + new features: + - title: "When searching for author names with accented characters, allow the non accented version to match. For example, searching for Nino will now match Niño." 
+ tickets: [879729] + + - title: "Driver for Blackberry Playbook, Motorola Electrify and Samsung Galaxy GIO S5660" + tickets: [805745,898123,897330] + + - title: "Metadata search and replace, make the regular expressions unicode aware" + + bug fixes: + - title: "Fix regression in 0.8.28 that broke sending PDF files to iTunes" + tickets: [896791] + + - title: "Metadata download, do not strip # from titles." + tickets: [898310] + + - title: "Conversion pipeline: Do not error out on books that set font size to zero." + tickets: [898194] + + - title: "News download: Respect the delay setting when downloading RSS feeds as well." + tickets: [897907] + + - title: "EPUB Output: Ensure that xml:lang is set if lang is set as ADE looks for xml:lang, not lang" + tickets: [897531] + + - title: "Content server: Reduce memory consumption when sending very large files" + tickets: [897343] + + - title: "Preserve capitalization of Scottish author names when downloading metadata" + + - title: "Fix update title sort in bulk metadata edit not using language information" + + - title: "Fix sorting by published column in the download metadata dialog broken" + tickets: [896832] + + - title: "Allow use of languages field when generating CSV/XML catalogs" + tickets: [896620] + + - title: "Get Books: Fix ebookpoint.pl" + + - title: "When calculating title sort for a book based on its language, only use the specified language not a combination of the language and english" + tickets: [896412] + + improved recipes: + - Metro NL + - Ming Pao + - Rolling Stones Mag + - Buffalo News + + new recipes: + - title: gs24.pl and Gazeta.pl Szczecin + author: Michal Szkutnik + + - title: Vanity Fair + author: Barty + + - title: Skylife + author: thomass + + - title: Daily Writing Tips + author: NotTaken + + - title: TechDirt + author: Krittika Goyal + + - title: Cosmopolitan UK + author: Dave Asbury + +- version: 0.8.28 + date: 2011-11-25 + + new features: + - title: "Get Books: Add litres.ru store" + + - 
title: "Change the algorithm that generates title sort strings to strip leading articles from both english and the current language set for the calibre user interface. In addition, in the edit metadata dialog, calibre will use the book's language when calculating the sort string. This behavior can be adjusted via Preferences->Tweaks." + tickets: [886763] + + - title: "Driver for Cybook Odyssey." + tickets: [893457] + + - title: "Irex driver: Put books into the top level directory instead of into /ebooks or /Books." + tickets: [883616] + + bug fixes: + - title: "Have downloaded periodicals recognized when transferred via USB to the Kindle Fire" + + - title: "MOBI Output: Fix underline and strikethrough properties declared on parents not being rendered on child tags." + tickets: [894245] + + - title: "Template language: Fix regression that broke ordering of items when formatting a list" + + - title: "Conversion pipeline: When removing obsolete <font> tags convert them to <div> instead of <span> if they contain block level tags." + tickets: [892525] + + - title: "When downloading metadata, fix the case normalization of double-barelled author names." + tickets: [893257] + + - title: "Template language: Fix regression that broke using general program mode in save to disk templates" + + - title: "calibredb: Fix use of ranges when specifying ids for the remove command" + + - title: "Apple driver: Add ids for iPhone 4S. More robust against iTunes automation errors when adding artwork." 
+ tickets: [892468] + + - title: "Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru" + + - title: "Fix calibre not getting list of books on the Kindle Fire" + + improved recipes: + - El Mundo + - BBC + - NIN Online + - ABC Australia + - Salon.com + - Expansion (Spanish) + - The Week + - Heise Online + + new recipes: + - title: Give me something to read and Let's get Critical + author: Barty + + - title: Worldcrunch + author: Krittika Goyal + +- version: 0.8.27 + date: 2011-11-18 + + new features: + - title: "Drivers for the Kindle Fire and the Nook Tablet" + tickets: [890918] + + - title: "Conversion: Add an option under Look & Feel to remove specified style information (CSS) from the document during conversion." + tickets: [871384] + + - title: "Add an option in the bulk metadata edit dialog to restore the pre-conversion files for many books with a single click." + tickets: [886116] + + - title: "Jobs list: Add the ability to search for and to hide jobs, useful if you have run a lot of jobs and the list is getting crowded." + tickets: [883734] + + - title: "Book jacket generation: Add ability to customize the book jacket template and add custom columns into the jacket." + tickets: [889912] + + - title: "MOBI Input: Performance improvement when viewing/converting a file with a lot of links" + + bug fixes: + - title: "Fix regression in 0.8.26 that broke disabling the update of particular fields during a bulk metadata download." + tickets: [889696] + + - title: "Get Books: Fix DRM status for legimi" + + - title: "When parsing for lxml via BeatifulSoup, use the calibre modified copy of BeautifulSoup (more robust)." 
+ tickets: [889890] + + - title: "HTML Input: Handle double encoded URLs in img tags" + tickets: [889323] + + improved recipes: + - Various Polish recipes + - Academia Catavencu + - El Periodico de Aragon + - Weblogs SL + - Folha de Sao Paolo (subscription) + + new recipes: + - title: News on Japan + author: Krittika Goyal + + - title: Formula AS + author: Silviu Cotoara + + - title: Various Turkish news sources + author: Osman Kaysan + + - title: Infra.pl and Spider's Web + author: fenuks + + +- version: 0.8.26 + date: 2011-11-12 + + new features: + - title: "Tweak to control sorting of date type columns. You can choose to have them sorted only by displayed fields" + + - title: "Driver for the Trekstor 3.0" + + - title: "Performance improvements when evaluating templates, and in particular general program mode templates" + + bug fixes: + - title: "ODT Input: When converting to EPUB improve handling of large images placed inside small frames, to prevent them from obscuring text." + tickets: [860272,884759] + + - title: "EPUB Input: Automatically strip entries of type application/text from the spine. Apparently there are EPUB production tools out there that create them." + tickets: [884792] + + - title: "Keep the startup splash screen visible until the GUI has fully completed initializing." + tickets: [885827] + + - title: "ODT Input: Fix handling of span tags containing only whitespace." + tickets: [887311] + + - title: "On windows when changing title or author via the main book list, handle the case of one of the books files being open in another program more gracefully." + tickets: [880585] + + - title: "When adding a format to an existing book record, ensure that no changes are made to the database until after the file operations have succeeded." 
+ + - title: "Fix bug that prevented configuring which fields to download metadata for when adding books by ISBN" + tickets: [856076] + + - title: "Fix Japanese characters not being crrectly displayed on index pages in news downloads for the SONY T1" + tickets: [888029] + + - title: "Get Books: Fix booleans in search expressions not working in non-English calibre versions" + tickets: [887554] + + - title: "Fix a bug in the support for hours/minutes/seconds in datetime format strings" + tickets: [887412] + + - title: "Treat an author_sort value of 'Unknown' the same way as unknown authors are treated in template processing" + + - title: "Detect SD card in Kobo Vox" + + - title: "Amazon metadata download: Workaround for change in Amazon website causing some books to have incorrect ratings downloaded" + + improved recipes: + - Metro NL + - The Independent + - Frankfurter Rundschau + - L'Espresso + - Il Giornale + - Berlingske.dk + - Suedeutsche Zeitung + + new recipes: + - title: Techtarget + author: Julio Map + - version: 0.8.25 date: 2011-11-06 diff --git a/Changelog.yaml b/Changelog.yaml index ec01df0107..a2e9130065 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,306 @@ # new recipes: # - title: +- version: 0.9.23 + date: 2013-03-15 + + new features: + - title: "New tool: \"Edit ToC\" that allows you to edit or create a Table of Contents easily in EPUB or AZW3 ebooks." + type: major + description: "Using the Edit ToC tool, you can easily re-arrange the entries in an existing Table of Contents, change their text and even change the location they point to by simply clicking the new location in the book. To use this tool, go to Preferences->Toolbar and add the Edit ToC tool to the main toolbar. Then simply select the books you want to be polished and click the Edit ToC button. This tool is based on a new codebase, so there may be bugs." 
+ + - title: "Content server: Enable use of plugboards for mobi and azw3 formats" + + - title: "Windows driver for Tolino Shine" + tickets: [1153536] + + - title: "When copying books to another library, show the name of the destination library in the copy dialog" + tickets: [1153407] + + - title: "Allow running plugins from the command line with calibre-debug easily" + + bug fixes: + - title: "PDF Output: Fix bug causing left and right margins to be applied to the cover page. Also fix the preserve cover aspect ratio option not working correctly" + + - title: "PDF Output: Fix javascript dialog box popping up in the middle on converting very long documents." + tickets: [1154948] + + - title: "MOBI metadata: When setting the language in a MOBI file also update the language field in the EXTH header." + tickets: [1154351] + + - title: "MOBI metadata: Support writing of book producer field into MOBI files." + tickets: [1154353] + + - title: "Fix job progress and status not always updated" + tickets: [1154137] + + - title: "Fix conversion of zip.rar archives with very long title/author on windows" + tickets: [1153859] + + - title: "News download: Update the library used to parse RSS feeds." + tickets: [1152852] + + - title: "Fix Irex Illiad not recognized on OS X computers" + tickets: [824713] + + - title: "Fix entering a very long search in the find item in tag browser box causes the tag browser to no longer be shrinkable." 
+ tickets: [1152870] + + - title: "Fix a bug in the zsh completion when converting for ebook-convert with PDF output" + + improved recipes: + - Various Polish news sources + - Harpers Full + - kath.net + - Smithsonian + + new recipes: + - title: Deccan Herald + author: Muruli Shamanna + + - title: What If + author: kisnick + + - title: The Friday Times + tickets: Krittika Goyal + + - title: Computer Woche + author: Maria Seliger + + - title: Lamebook + author: atordo + +- version: 0.9.22 + date: 2013-03-08 + + new features: + - title: "Linux driver for the Tolino ebook reader." + tickets: [1151901] + + - title: "Kobo driver: Add support for the new 'Archived' collections in Kobo firmware 2.4.0 and improve handling recommendations and previews." + tickets: [1150852] + + - title: "Metadata search and replace: Allow replacing the entire set of identifiers instead of only a specific identifier when doing a search and replace. To do this, choose a source field other than identifiers and set the destination identifier type to: *" + + - title: "Show a brief description of every action when customizing toolbars in Preferences->Toolbars" + + - title: "Allow drag and drop of books from the calibre book list onto the button for Convert book, Edit metadata, Remove Book, Tweak Book, Polish books, etc." + + bug fixes: + - title: "CHM Input: Fix incorrect decoding for CHM files whose hhc file is also a content file." + tickets: [1151721] + + - title: "Conversion: Add the double low quote to list of characters that are converted to ascii." + tickets: [1152207] + + - title: "Amazon metadata download: Update plugin to handle changes to Amazon site that could cause some covers to not be downloaded. Also fix finding metadata for books with + in their titles." 
+ + - title: "Content server: Fix a bug that prevented the cover being updated when files are downloaded from the content server" + + - title: "Conversion: Handle the use of @import CSS rules inside <style> tags in HTML files" + + - title: "Book polishing: Do not error out when polishing epub files that have XML comments in their OPF metadata section." + + - title: "Book polishing: Do not error out when updating covers in EPUB files that have entries in their manifest that point to missing files" + + - title: "Book polishing: Fix a bug that could cause updating covers to error out in some books" + + - title: "Fix updating the calibre application id in EPUBs that also use the application id as the package id." + + - title: "Apple driver: Fix bug preventing sending books to iBooks if no books have been previously added to iBooks." + tickets: [1141078] + + - title: "EPUB/AZW3 Output: Fix splitting on page breaks ignored if the page breaks are inside an element which itself has a page-break-after style applied." + + - title: "EPUB/AZW3 Output: Fix incorrect splitting of html at page-break-after page breaks in certain circumstances (The split element being the first child of a parent that contains other split elements)." + tickets: [1139317] + + improved recipes: + - Le Devoir + - New York Times Book Review + - Various Polish news sources + + new recipes: + - title: Various new Polish news sources + +- version: 0.9.21 + date: 2013-03-01 + + new features: + - title: "Content server: When browsing random books, add a button to the book page to get another random book." + tickets: [1134958] + + - title: "Kobo driver: Update cover uploading for Kobo firmware 2.3.1 and later." + tickets: [1135649] + + - title: "Add a tweak under Preferences->Tweaks to try to recognize numbers inside text fields like title when sorting. This will cause Book 2 to sort before Book 100. However, it is slower and can have trouble when the text starts with a number. 
If you want numeric sorting you should use the series field for it." + tickets: [1132025] + + - title: "Get Books: Update the Amazon and Foyles store plugins" + + - title: "Add a setting in Preferences->Tweaks that controls the sorting of the Copy to Library and Quick Switch menus. If the number of libraries is larger than the set value, the lists are sorted alphabetically instead of by frequency of use." + tickets: [1133691] + + - title: "Driver for Iriver Story EB12." + tickets: [1132583] + + - title: "Edit metadata dialog: When pasting in copied text into the comments area, you can now choose to discard all formatting. Right click on the comments area and select 'Paste and Match style' which will paste the copied text as plain text formatted in the current style." + + - title: "Book polishing: Make updating cover a separate option, so you can now update metadata without updating the cover." + + - title: "Linux build: Install zsh completion for the calibre command line utilities" + + bug fixes: + - title: "Conversion: Do not rescale fonts sizes/adjust line heights for text based drop caps defined using a separate <span> tag (drop caps defined using :first-letter were already handled correctly)" + + - title: "E-book viewer: Fix clicking links going to slightly incorrect locations in some books." + tickets: [1132641] + + - title: "E-book viewer: Fix rendering of pages for right-to-left text in paged mode is reversed." + tickets: [1132626] + + - title: "E-book viewer: Fix bug in rendering prefixed svg tags in the cover pages of some EPUB files." + + - title: "PDF Output: Do not error out when embedding a font that calibre cannot subset, instead embed the full font" + + - title: "Book polishing: Fix bug that caused the ORIGINAL_EPUB format to be replaced by the EPUB format when polishing a book with both ORIGINAL_EPUB and EPUB" + + - title: "Polishing books: Ignore unsupported fonts instead of erroring out on them." 
+ tickets: [1132085] + + - title: 'Make bulk edit of custom columns respect the "apply changes" checkbox even if the value to set has not changed' + + improved recipes: + - Science News + - Die Zeit (subscription version) + + new recipes: + - title: Financial Times (US subscription version) and Nezavisne Novine + author: Darko Miletic + + - title: Geopolityka + author: chemik111 + + - title: Democracy Journal + author: David Nye + + - title: HNOnline + author: Ladislav Lencucha + + - title: Various Colombian news sources + author: Ismael Mejia + +- version: 0.9.20 + date: 2013-02-22 + + new features: + - title: "Book polishing: Add an option to smarten punctuation in the book when polishing" + + - title: "Book polishing: Add an option to delete all saved settings to the load saved settings button" + + - title: "Book polishing: Remember the last used settings" + + - title: "Book polishing: Add a checkbox to enable/disable the detailed polishing report" + + - title: "Add a separate tweak in Preferences-Tweaks for saving backups of files when polishing. That way you can have calibre save backups while converting EPUB->EPUB and not while polishing, if you so desire." + + - title: "Content server: Allow clicking on the book cover to download it. Useful on small screen devices where clicking the Get button may be difficult" + + - title: "Driver for Energy Systems C4 Touch." + tickets: [1127477] + + bug fixes: + - title: "E-book viewer: Fix a bug that could cause the back button in the viewer to skip a location" + + - title: "When tweaking/polishing an azw3 file that does not have an identified content ToC, do not auto-generate one." + tickets: [1130729] + + - title: "Book polishing: Use the actual cover image dimensions when creating the svg wrapper for the cover image." + tickets: [1127273] + + - title: "Book polishing: Do not error out on epub files containing an iTunesMetadata.plist file." 
+ tickets: [1127308] + + - title: "Book polishing: Fix trying to polish more than 5 books at a time not working" + + - title: "Content server: Add workaround for bug in latest release of Google Chrome that causes it to not work with book lists containing some utf-8 characters" + tickets: [1130478] + + - title: "E-book viewer: When viewing EPUB files, do not parse html as xhtml even if it has svg tags embedded. This allows malformed XHTML files to still be viewed." + + - title: "Bulk metadata edit Search & replace: Update the sample values when changing the type of identifier to search on" + + - title: "Fix recipes with the / character in their names not useable from the command line" + tickets: [1127666] + + - title: "News download: Fix regression that broke downloading of images in gif format" + + - title: "EPUB/AZW3 Output: When splitting the output html on page breaks, handle page-break-after rules correctly, the pre split point html should contain the full element" + + - title: "Fix stdout/stderr redirection temp files not being deleted when restarting calibre from within calibre on windows" + + - title: "E-book viewer: When viewing epub files that have their cover marked as non-linear, show the cover at the start of the book instead of the end." + tickets: [1126030] + + - title: "EPUB Input: Fix handling of cover references with fragments in the urls" + + improved recipes: + - Fronda + - Various Polish news sources + + new recipes: + - title: Pravda + author: Darko Miletic + + - title: PNN + author: n.kucklaender + + - title: Various Polish news sources + author: fenuks + +- version: 0.9.19 + date: 2013-02-15 + + new features: + - title: "New tool: \"Polish books\" that allows you to perform various automated cleanup actions on EPUB and AZW3 files without doing a full conversion." + type: major + description: "Polishing books is all about putting the shine of perfection on your ebook files. 
You can use it to subset embedded fonts, update the metadata in the book files from the metadata in the calibre library, manipulate the book jacket, etc. More features will be added in the future. To use this tool, go to Preferences->Toolbar and add the Polish books tool to the main toolbar. Then simply select the books you want to be polished and click the Polish books button. Polishing, unlike conversion, does not change the internal structure/markup of your book, it performs only the minimal set of actions needed to achieve its goals. Note that polish books is a completely new codebase, so there may well be bugs, polishing a book backs up the original as ORIGINAL_EPUB or ORIGINAL_AZW3, unless you have turned off this feature in Preferences->Tweaks, in which case you should backup your files manually. You can also use this tool from the command line with ebook-polish.exe." + + - title: "Driver for the Trekstor Pyrus Mini." + tickets: [1124120] + + - title: "E-book viewer: Add an option to change the minimum font size." + tickets: [1122333] + + - title: "PDF Output: Add support for converting documents with math typesetting, as described here: http://manual.calibre-ebook.com/typesetting_math.html" + + - title: "Column coloring/icons: Add more conditions when using date based columns with reference to 'today'." + + bug fixes: + - title: "Transforming to titlecase - handle typographic hyphens in all caps phrases" + + - title: "Dont ignore file open events that occur before the GUI is initialized on OS X" + tickets: [1122713] + + - title: "News download: Handle feeds that have entries with empty ids" + + - title: "Fix a regression that broke using the template editor" + + - title: "Do not block startup while scanning the computer for available network interfaces. Speeds up startup time on some windows computers with lots of spurious network interfaces." 
+ + improved recipes: + - New Yorker + - Kommersant + - Le Monde (Subscription version) + - NZ Herald + + new recipes: + - title: Navegalo + author: Douglas Delgado + + - title: El Guardian and More Intelligent Life + author: Darko Miletic + - version: 0.9.18 date: 2013-02-08 @@ -1876,1682 +2176,3 @@ - title: La gazetta del Mezzogiorno author: faber1971 -- version: 0.8.53 - date: 2012-05-25 - - new features: - - title: "Kindle Touch/4 driver: Upload cover thumbnails when sending books to device by USB to workaround Amazon bug of not displaying covers for sync-enabled books" - - - title: "Support for updating metadata in FB2 files" - - - title: "Set a different background color when choosing formats to not delete as opposed to choosing format to delete." - tickets: [ 1001741 ] - - - title: "E-book viewer: Add an option to prevent the up and down arrow keys from scrolling past page breaks" - - - title: "Get Books: Remove ebookshoppe.com at the website's request" - - bug fixes: - - title: "PDF Input: Support image rotation commands in PDF files. Fixes the long standing problem of some images being flipped when converting from PDF in calibre." - - - title: "Fix a regression in 0.8.51 that caused conversion to HTMLZ to not have any CSS" - - - title: "Get Books: Fix website change at kobo.com causing prices not to be found" - - - title: "Edit the time in the 24 hour clock when calibre's interface language is set to German." - tickets: [ 1001809 ] - - - title: "MOBI Output: When generating joint KF8/MOBI6 .mobi files set the text length field in the MOBI 6 header correctly. " - tickets: [ 1003489 ] - - - title: "ODT Input: More workarounds for LibreOffice 3.5's habit of inserting pointless margin:100% directives everywhere." - tickets: [ 1002702 ] - - - title: "Fix regression that broke smarten punctuation when quotes were next to html tags." 
- tickets: [ 998900 ] - - - title: "Fix published date from ozon.ru wrong in some timezones" - tickets: [ 975338 ] - - - title: "Catalogs: Handle the use of custom columns with non-ascii names correctly" - tickets: [1001437] - - - title: "Conversion pipeline: Remove the attempt to detect and autocorrect if text will go off the left edge of the page, as it was a rather crude heuristic. Also do not remove fake margins if the book uses negative text indents on the margined elements." - - - title: "KF8 Output: Set offsets to tags in the skeleton the same way kindlegen does. Also linearize non linear ToCs to ensure section to section jumping works." - - - title: "Conversion pipeline: Use correct default value of 'inherit' for font-family and font-size when normalizing the shorthand font property." - - - title: "When running python scripts via calibre-debug ensure that user plugins are loaded" - - improved recipes: - - Business Week Magazine - - Metro Nieuws NL - - new recipes: - - title: Attac.es - author: Marc Busque - - - title: Drytooling.com - author: Damian Granowski - - - title: Shortlist.com - author: Dave ASbury - - - title: National Geographic (es) - author: vakya - -- version: 0.8.52 - date: 2012-05-18 - - new features: - - title: "EPUB Input: When setting the cover for a book that identifies its cover image, but not the html wrapper around the cover, try to detect and remove that wrapper automatically." - tickets: [ 999959 ] - - - title: "When deleting books of a specific format, show the number of books with each format available" - - - title: "Linux install: No longer create MAN pages as all utilities have more comprehensive command line --help anyway" - - - title: "Add a tweak Preferences->Tweaks to control the default choice of format for the Tweak Book feature" - - - title: "Conversion: Allow setting negative page margins. 
A negative page margin means that calibre will not specify any page margin in the output document (for formats that support this)" - - bug fixes: - - title: "Tweak book: Fix handling of covers when tweaking KF8 books" - - - title: "KF8 Output: Handle input documents with out of sequence ToC entries. Note that currently section jumping in the KF8 output produced by calibre for such files does not work." - tickets: [1000493] - - - title: "Edit metadata dialog: Fix the edit values button for custom tag-like columns showing a unneeded warning about changed values" - - - title: "EPUB Output: Be a little more conservative when removing <form> tags. Only remove them if they have actual forms inside. " - tickets: [ 1000384 ] - - - title: "EPUB Input: Correctly update the Cover entry in the ToC even when the entry has a fragment reference. " - tickets: [ 999973 ] - - - title: "Update ImagMagick DLLs in all calibre binary builds to fix security vulnerabilities in ImageMagick" - tickets: [ 999496 ] - - - title: "Advanced search dialog: Fix equals and regex matching not being applied for custom column searches." - tickets: [ 980221 ] - - - title: "RTF Input: Handle old RTF files that have commands without braces." - tickets: [ 994133 ] - - - title: "Get Books: Diesel, fix results not showing when only a single match is found" - - - title: "Get Books: Fix DRM status indicators for Kobo and Diesel stores. Fix smashwords not returning results." 
- tickets: [ 993755 ] - - - title: "Fix regression in 0.8.51 that broke viewing of LIT and some EPUB files" - tickets: [998248, 998216] - - improved recipes: - - Clarin - - Spiegel - - Spiegel International - - Montreal Gazette - - Gosc Niedzelny - - Ars Technica - - new recipes: - - title: "Army/Navy/Air force/Marine Times and News busters" - author: jde - - - title: "Ads of the World, Heavy Meta (Italian) and Juve La Stampa" - author: faber1971 - - - title: "Revista Summa" - author: Vakya - - - title: "Strategic culture" - author: Darko Miletic - - - title: Stars and Stripes - author: adoucette - - - title: Nackdenkseiten - author: jrda - - -- version: 0.8.51 - date: 2012-05-11 - - new features: - - title: "When switching libraries preserve the position and selected books if you switch back to a previously opened library." - tickets: [994514] - - - title: "Conversion pipeline: Filter out the useless font-face rules inserted by Microsoft Word for every font on the system" - - - title: "Driver for Motorola XT875 and Pandigital SuperNova" - tickets: [996890] - - - title: "Add a colour swatch the the dialog for creating column coloring rules, to ease selection of colors" - tickets: [994811] - - - title: "EPUB Output: Consolidate internal CSS generated by calibre into external stylesheets for ease of editing the EPUB" - - - title: "List EPUB and MOBI at the top of the dropdown list fo formats to convert to, as they are the most common choices" - tickets: [994838] - - bug fixes: - - title: "E-book viewer: Improve performance when switching between normal and fullscreen views." - tickets: [996102] - - - title: "Edit metadata dialog: When running download metadata do not insert duplicate tags into the list of tags" - - - title: "KF8 Input: Do not error out if the file has a few invalidly encoded bytes." - tickets: [997034] - - - title: "Fix download of news in AZW3 format not working" - tickets: [996439] - - - title: "Pocketbook driver: Update for new PB 611 firmware." 
- tickets: [903079] - - - title: "ebook-convert: Error out if the user prvides extra command line args instead of silently ignoring them" - tickets: [994939] - - - title: "EPUB Output: Do not self close any container tags to prevent artifacts when EPUBs are viewed using buggy browser based viewers." - tickets: [994861] - - - title: "Fix regression in 0.8.50 that broke the conversion of HTML files that contained non-ascii font-face declarations, typically produced by Microsoft Word" - - improved recipes: - - Mainichi news - - derStandard - - Endgadget Japan - - new recipes: - - title: Mainichi English - author: Hiroshi Miura - - - title: The Grid TO - author: Yusuf W - - - title: National Geographic (Italy) - author: faber1971 - - - title: Rebelion - author: Marc Busque - -- version: 0.8.50 - date: 2012-05-04 - - new features: - - title: "Tweak Book: Allow tweaking of KF8 MOBI files. Useful to fine-tune the result of a conversion. Right click on the book and select Tweak Book to use the feature. Note that tweaking a MOBI file that contains both KF8 and older MOBI6 will cause the MOBI6 version to be discarded." - - - title: "AZW3 output plugin. This output plugin generates pure KF8 mobi files. These only work on the Kindle Fire and Kindle Touch with latest firmware." - - - title: "Conversion: Allow easy re-ordering of the search and replace expressions in the conversion dialog. Also apply the expressions in the same order that they were entered when doing the conversion." - - - title: "Automatically add the Tag 'Sample Book' when an Amazon sample is added to calibre" - - - title: "FB2 Input: Better handling of inline images." - tickets: [989869] - - bug fixes: - - title: "KF8 Output: Fix section to section jumps not working for documents with multi-level ToCs" - - - title: "EPUB Input: Handle the case of the metadata ToC containing a reference to the cover HTML file." 
- tickets: [993812] - - - title: "CHM Input: Handle files with deeply nested markup and non html files listed at the start of the manifest." - tickets: [993607] - - - title: "KF8 Output: Workaround Kindle Touch bug that causes the book to be rendered as black pages when a height is specified for <body>" - - - title: "Fix regression in 0.8.49 that broke italics detection in heuristic processing on 32-bit systems." - tickets: [991380] - - - title: "KF8 Output: Fix joint MOBI6/KF8 books not being recognized as MOBI files by older Kindles" - - - title: "KF8 Output: Fix errors when processing documents with HTML comments and/or XML processing instructions" - - - title: "Get Books: Amazon fix prices not being found. B&N fix details link. ebooks.com: fix cover image. Website changes to various EU stores" - - - title: "FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded." - tickets: [990929] - - - title: "Fix scrolling with the cover browser updating only the selection in the book list, not the current book." - tickets: [990881] - - - title: "Save to Disk: Do not run out memory when saving very large files on systems with low RAM." - tickets: [990741] - - - title: "FB2 Output: Use 2 letter language codes in preference to 3-letter ones to not break poorly implemented FB2 readers" - tickets: [990026] - - - title: "EPUB Input: Auto set the media-type for OPF manifest entries with an empty media-type" - - improved recipes: - - National Post - - Daily Mirror - - Sun - - Newsweek Polska - - Max-Planck - - derStandard - - tweakers.net - - new recipes: - - title: George Monbiot - author: Darko Miletic - - - title: El Mundo - author: atordo - - - title: AraInfo and Diagonal - author: Ruben Pollan - - -- version: 0.8.49 - date: 2012-04-27 - - new features: - - title: "Experimental support for generating Amazon's new KF8 format MOBI files" - description: "calibre can now generate Amazon's new KF8 format MOBI files. 
- To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. In the box add: - test_mobi_output_type = 'both' - calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them. - To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511 - Note that calibre support for KF8 is still experimental and there will likely be bugs." - - - title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness." - - - title: "Show cover size in a tooltip in the conversion dialog" - tickets: [986958] - - - title: "Driver for Nook Simple Touch with Glow Light" - tickets: [989264] - - bug fixes: - - title: "Heuristics: When italicizing words do not operate on words not in between HTML tags." - tickets: [986298] - - - title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates." - tickets: [986658] - - - title: "Fix tooltip not being updated in the book details panel when pasting in a new cover" - tickets: [986958] - - - title: "Cover Browser: Wrap the title on space only, not in between words." - tickets: [986516] - - - title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book." - tickets: [986903] - - - title: "Fix heuristics not removing unnecessary hyphens from the end of lines." - tickets: [822744] - - improved recipes: - - Metro Nieuws NL - - Der Tagesspiegel - - new recipes: - - title: Berria - author: Alayn Gortazar - - - title: Sol Haber - author: Onur Gungor - - - title: Telam - author: Darko Miletic - - - title: Richmond Times-Dispatch - author: jde - -- version: 0.8.48 - date: 2012-04-20 - - new features: - - title: "Conversion: The search and replace feature has been completely revamped." - description: "You can now use any number of search and replace - expression, not just three. 
You can also store and load frequently used - sets of search and replace expressions. Also, the wizard generates its - preview in a separate process to protect against crashes/memory leaks." - tickets: [983476,983484,983478] - - - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free." - - - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X" - tickets: [981185] - - bug fixes: - - title: "Get Books: Support the new website design of Barnes & Noble" - - - title: "T1 driver: Fix books sent to SD card sometimes resulting problems when deleted." - tickets: [943586] - - - title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'" - - - title: "MOBI Output: Handle background color specified on <td> and <tr> in addition to <table> tags." - tickets: [980813] - - - title: "MOBI Output: Fix underline style applied to parent element not getting inherited by <a> children." - tickets: [985711] - - improved recipes: - - xkcd - - Metro Nieuws - - Calgary Herald - - Orlando Sentinel - - countryfile - - Heise - - new recipes: - - title: Various new Polish news sources - author: fenuks - - - title: Various Italian news sources - author: faber1971 - - - title: Jakarta Globe - author: rty - - - title: Acim Bilim Dergisi - author: thomass - -- version: 0.8.47 - date: 2012-04-13 - - new features: - - title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec." 
- tickets: [976056] - - - title: "Support for viewing and converting the Haodoo PDB ebook format" - tickets: [976478] - - - title: "Device driver for Laser EB720" - - bug fixes: - - title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled" - tickets: [976336] - - - title: 'Fix "Tags" field in advanced search does not obey regex setting' - tickets: [980221] - - - title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single <img> tag, instead of rendering the page" - - - title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device" - - - title: "Amazon metadata download: Handle books whose titles start with a bracket." - tickets: [976365] - - - title: "Get Books: Fix downloading of purchased books from Baen" - tickets: [975929] - - - improved recipes: - - Forbes - - Caros Amigos - - Trouw - - Sun UK - - Metro - - Daily Mirror - - new recipes: - - title: "Melbourne Herald Sun" - author: Ray Hartley - - - title: "Editoriali and Zerocalcare" - author: faber1971 - -- version: 0.8.46 - date: 2012-04-06 - - new features: - - title: "Auto adding: When automatically adding files from a folder, automatically convert the files to the current output format after adding. This can be turned off via Preferences->Adding Books->Automatic Adding." - tickets: [969053] - - - title: "E-book viewer: When reading a MOBI file that is actually a KF8 book, show the format as being KF8" - - - title: "Content server: Workaround for android stock browser not support HTTP AUTH." - - - title: "Edit metadata dialog: Change the remove unused series button to a clear series button (as the remove unused series function is now automatic)" - - - title: "Driver for PocketBook 622." 
- tickets: [969875] - - bug fixes: - - title: "Run metadata downloads in a separate process to workaround memory leaks in third party plugins. Also removes the need to break up bulk metadata downloads into 100 book batches." - - - title: "Make tag browser filtering work when capital letters are entered." - - - title: "EPUB metadata: Ignore urn:isbn: prefix from ISBN declaration when reading metadata" - - - title: "Get books: Fix feedbooks store not showing all available formats" - - - title: "KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead." - tickets: [969238] - - - title: "Fix regression that broke access to Preferences via the Preferences item in the calibre menu on OS X" - tickets: [969418] - - - title: "Fix bug that ignored metadata specified on the command line when using calibredb add" - - improved recipes: - - OReilly Premium - - Real Clear - - Soldier's Magazine - - Rue89 - - new recipes: - - title: The Southern Star - author: watou - - - title: Buenos Aires Herald - author: Darko Miletic - -- version: 0.8.45 - date: 2012-03-30 - - new features: - - title: "E-book viewer: Allow the up and down keys to scroll past section boundaries" - - - title: "calibredb: Allow specification of basic metadata on the command line when adding books." - tickets: [951063] - - - title: "Driver for Samsung Galaxy Plus GT-I9001" - - - title: "KF8 Input: Support KF8 format Amazon book samples." - tickets: [963418] - - - title: "When a new plugin is added to calibre for the first time, have its icon (if any) show up even when a device is connected (this can be changed by the user at the time of plugin installation)" - - - title: "Add keyboard shortcuts for Bold, Italic and Underline to the comments editor in the edit metadata dialog" - tickets: [963559] - - bug fixes: - - title: "E-book viewer: Fix last read position (and bookmarks in general) being inaccurate for some books." 
- description: "The technique for marking locations in books used by the viewer has changed. The new technique should be much more accurate than the last one, especially when the font size at which the book is being viewed is changed. Note that this change means that bookmarks created with this release of calibre will not be read by previous calibre versions. On a technical note, the viewer now uses the CFI specification from the EPUB 3 standard for bookmarks." - type: major - - - title: "Workarounds for a few regressions in the user interface in 0.8.44 caused by the update to Qt 4.8.0" - - - title: "Books list: Preserve the horizontal scroll position when sorting by a column" - - - title: "Fix saving to disk and then adding the book back not restoring tags-like custom columns" - - - title: "Linux installer: Fix completion for ebook-convert not working." - tickets: [967834] - - - title: "MOBI Output: Recognize type=text in addition to type=start guide elements" - - - title: "Get Books: Updates to Nexto, Ebookpoint and Woblink stores" - - - title: "Fix unable to clear username/password in Fetch news dialog" - - - title: "PDF Output: Fix margin specifications not being applied" - - - title: "Linux installer: Manually preserve the defaults.list mimetype association file to workaround buggy xdg-desktop-menu implementations in some distros." - tickets: [926559] - - - title: "E-book viewer: Fix regression that caused the ebook viewer to stop functioning if it is launched from the main calibre program and then the main calibre program is closed." - tickets: [963960] - - - improved recipes: - - Our Daily Bread - - new recipes: - - title: NRC Handelsblad (free) - author: veezh - -- version: 0.8.44 - date: 2012-03-23 - - new features: - - title: "E-book viewer: A whole new full screen mode." - description: "The new mode has no toolbars to distract from the text and the ability to set the width of the column of text via Preferences in the ebook viewer. 
Click the Fullscreen button on the toolbar in the viewer to enter fullscreen mode (or press the F11 or Ctrl+Shit+F keys)" - type: major - tickets: [959830] - - - title: "Copy to Library: If books were auto merged by the copy to library process, popup a message telling the user about it, as otherwise some people forget they have turned on auto merge and accuse calibre of losing their books." - - - title: "Unix driver for Ectaco JetBook color" - tickets: [958442] - - - title: "Add a link to the 'Adding Books Preferences' in the drop down menu of the Add Books button for easier access and more prominence" - tickets: [958145] - - - title: "Smarten punctuation: Add a few more cases for detecting opening and closing quotes" - - bug fixes: - - title: "Get Books: Updates to various store plugins to deal with website changes: Amazon Europe, Waterstones, Foyles, B&N, Kobo, Woblink and Empik" - - - title: "Catalog generation: Do not error out when generating csv/xml catalogs if the catalog title contains filename invalid characters." - tickets: [960154] - - - title: "RTF Output: Ignore corrupted images in the input document, instead of erroring out." - tickets: [959600] - - - title: "E-book viewer: Try to preserve page position when the window is resized" - - - title: "Fix bug that caused wrong series to be shown when clicking on the first letter of a series group in the Tag Browser" - - - title: "Fix calibre not supporting different http and https proxies." - tickets: [960173] - - - title: "MOBI Input: Fix regression caused by KF8 support that broke reading of ancient non-Amazon PRC files" - - - title: "Fix EPUB to EPUB conversion of an EPUB with obfuscated fonts resulting in the fonts not being readable in Adobe Digital Editions" - tickets: [957527] - - - title: "RTF Output: Fix bug that broke conversion to RTF when the input document contains <img> tags with no src attribute." 
- - - title: "Fix regression in 0.8.43 that broke use of general mode templates that ended in a semi-colon." - tickets: [957295] - - improved recipes: - - b92 - - Various Polish news sources - - Le Monde - - FHM UK - - new recipes: - - title: Ivana Milakovic and Klub knjige - author: Darko Miletic - - -- version: 0.8.43 - date: 2012-03-16 - - new features: - - title: "Template language: Speedup evaluation of general program mode templates by pre-compiling them to python. If you experience errors with this optimization, you can turn it off via Preferences->Tweaks. Also other miscellaneous optimizations in evaluating templates with composite columns." - - - title: "MOBI Output: Add an option to not convert all images to JPEG when creating MOBI files. For maximum compatibility of the produced MOBI files, do not use this option." - tickets: [954025] - - - title: "Add iPad3 Output Profile" - - bug fixes: - - title: "KF8 Input: Add support for KF8 files with obfuscated embedded fonts" - tickets: [953260] - - - title: "Make the stars in the book list a little larger on windows >= vista" - - - title: "Revised periodical Section layout, for touchscreen devices resolving iBooks problem with tables spanning multiple pages" - - - title: "Read dc:contributor metadata from MOBI files" - - - title: "MOBI Output: Fix a regression that caused the generated thumbnail embedded in calibre produced MOBI files to be a large, low quality image instead of a small, high quality image. You would have been affected by this bug only if you directly used the output from calibre, without exporting it via send to device or save to disk." - tickets: [954254] - - - title: "KF8 Input: Recognize OpenType embedded fonts as well." - tickets: [954728] - - - title: "Fix regression in 0.8.41 that caused file:/// URLs to stop working in the news download system on windows." 
- tickets: [955581] - - - title: "When setting metadata in MOBI files fix cover not being updated if the mobi file has its first image record as the cover" - - - title: "Fix column coloring rules based on the size column not working" - tickets: [953737] - - improved recipes: - - Microwaves and RF - - idg.se - - new recipes: - - title: SatMagazine - author: kiavash - -- version: 0.8.42 - date: 2012-03-12 - - new features: - - title: "Support for reading Amazon's new KF8 format" - type: major - description: "calibre can now both view and convert MOBI files that contain Amazon's new KF8 (Kindle Fire) format" - - - title: "Add a tweak to Preferences->Tweaks to control the font size used in the book details panel" - tickets: [948357] - - - title: "Allow specifying a list of file types to exclude when automatically adding files from a folder" - tickets: [943025] - - - title: "Show ratings in the book details panel as stars. Also allow the user to change the alignment of the ratings column in the main books list. No longer display the stars in blue, instead their color can be customized via the column coloring rules, like any other column" - - - title: "When setting metadata in EPUB ensure that the <meta name=cover> tag has its name attribute first. Needed for the Nook." - - - title: "Drivers for Novo 7, LG G2x and Zenithink T-280" - tickets: [941671, 940625, 940527] - - - title: "Update linux binaries to Qt 4.8.0" - - bug fixes: - - title: "Fix some rar files causing crashes on OS X (updated libunrar.dylib in the OS X build)" - tickets: [951185] - - - title: "MOBI Output: Ignore the Table of Contents pointed to by the guide, if it contains no links" - - - title: "ODT Input: Ignore margin declaration in ODT styles if more specific margin-* declarations are present" - tickets: [941134] - - - title: "Conversion pipeline: Fix @import rules in CSS stylesheets that have comments on their first few lines being ignored." 
- - - title: "EPUB Input: When extracting the contents of epub files on windows, do not error out if one or more of the components in the epub file have filepaths containing characters that are invalid for the windows filesystem, instead, just replace those characters, since those entries are likely to be errors in the zip container anyway." - tickets: [950081] - - - title: "Textile output: Fix issue with blockquotes and sentences getting removed." - - - title: "MOBI Output: When using the prefer author sort conversion option, handle multiple authors better." - tickets: [947146] - - - title: "Fix regression in 0.8.41 that broke direct connection to iDevices in windows" - tickets: [944534] - - - title: "Fix the download bulk metadata completed popup causing a crash if the Esc key is pressed." - tickets: [943056] - - - title: "Fix rating values doubled in CSV/XML catalogs" - tickets: [942790] - - - title: "EPUB Input: Remove non markup documents from the spine automatically, instead of erroring out" - - - title: "When formatting ratings in templates, etc., do not have an unnecessary .0" - - - title: "Calibre portable: Do not allow calibre portable to run if it is placed in a location whose path is too long. Also hide the library location setup in the welcome wizard when running the portable build." - - - title: "Fix regression in 0.8.41 that broke calibre if the TMP or TEMP environment variable is set to the root of a drive." 
- tickets: [952284] - - - title: "Fix display of ratings type custom fields in the content server" - tickets: [940600] - - - improved recipes: - - La Jornada - - Chicago Tribune - - Mediapart - - rue89 - - new recipes: - - title: Racjonalista - author: Racjonlista - - - title: JAPAA - author: adoucette - - -- version: 0.8.41 - date: 2012-02-24 - - new features: - - title: "Driver for Sony Experia Play 4G" - tickets: [938831] - - - title: "News download system: Allow use of __future__ in recipes, and do not change line numbers of code in the recipe when compiling it" - - - title: "Use the My Documents folder as the default location for the Calibre Library folder on first start in windows" - tickets: [934840] - - - title: "Add a tweak to Preferences->Tweaks to control the order in which categories appear in the Tag Browser" - - - title: "Tag Browser: Add an entry to the right click menu to quickly delete tags" - tickets: [934509] - - - title: "Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it ends up using." - - - title: "Content server: Add favicon to OPDS feeds." - tickets: [934731] - - bug fixes: - - title: "RTF Input: Fix some WMF images embedded in RTF files being distorted on conversion." - tickets: [934167] - - - title: "Fix long standing bug preventing calibre from working on east asian windows installs when the user name in windows has non-ascii characters" - tickets: [937389] - - - title: "Get Books: Fix Baen Webscription and O'Reilly stores. Fix price detection for Google Books" - - - title: "MOBI Output: When the same anchor is present more than once in the input document, use the first occurrence rather than the last one." 
- tickets: [934031] - - - title: "Use the 'default cover font' tweak when generating default masthead images as well" - tickets: [939256] - - - title: "Fix content server does not correctly display custom field of type 'rating'" - tickets: [938303] - - - title: "Fix welcome wizard does not save send-from email info unless send-to field is filled" - tickets: [937087] - - - title: "When reading metadata from odt files, use initial-creator in preference to creator for setting the author field" - tickets: [934564] - - - title: "Fix conversion erroring out when the input document has very long and thin images" - tickets: [935234] - - improved recipes: - - The Sun - - Various Polish news sources - - Mediapart - - new recipes: - - title: La pausa caffe - author: faber1971 - - - title: Various Polish news sources - author: fenuks - - -- version: 0.8.40 - date: 2012-02-17 - - new features: - - title: "Amazon metadata download: Support the new 'Book Description' section that Amazon publishes for some books. Also workaround the amazon US servers occasionally returning broken markup leading to calibre not finding any matches for books on Amazon." - - - title: "Kindle driver: Add an option to allow using page counts stored in a custom column. Go to Preferences->Plugins and customize the Kindle driver, to tell it to use a custom column to get page count data. See http://www.mobileread.com/forums/showpost.php?p=1963075&postcount=215 for details." - - - title: "Template language: Add a current_library_name() function that can be used to return the name of the currently opened library in calibre" - - - title: "Driver for Xperia Neo and PocketBook A10" - tickets: [930788] - - bug fixes: - - title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh format information in standalone calibre-server processes" - - - title: "Fix regression in 0.8.39 that broke getting covers from some epub files on OS X." 
- tickets: [932507] - - - title: "Reading metadata from HTML files: Do not take a very long time for very large HTML files. Also fix reading metadata from meta tags with multiple spaces before the content attribute." - tickets: [932262] - - - title: "EPUB Output: Fix splitting breaking internal links in the epub, if the links pointed to files with URL unsafe characters in their file names." - tickets: [929966] - - - title: "Fix auto adding not leaving languages field blank when book has no defined language" - tickets: [930648] - - improved recipes: - Samanyolu Haber - Kurier - Le devoir - Daily Mirror - Common Dreams - Pescanik - - new recipes: - title: Asian Review of Books - author: Darko Miletic - - - title: Albert Mohler, Desiring God, Living Stones and Resurgence - author: Peter Grungi - - - title: Novinite BG - author: M3 Web - - - title: Catholic Daily Readings - author: adoucette - - - title: Consortium News and Microwave and RF magazine - author: kiavash - -- version: 0.8.39 - date: 2012-02-10 - - new features: - - title: "Auto-adding: Add an option to check for duplicates when auto adding." - tickets: [926962] - - - title: "Content server: Export a second record via mDNS that points to the full OPDS feed in addition to the one pointing to the Stanza feed. The new record is of type _calibre._tcp." - tickets: [929304] - - - title: "Allow specifying a set of categories that are not partitioned even if they contain a large number of items in the Tag Browser. Preference is available under Look & Feel->Tag Browser" - - - title: "Allow setting a URL prefix for the content server that run embedded in the calibre GUI as well." - tickets: [928905] - - - title: "Allow output of identifiers data in CSV/XML/BiBTeX catalogs" - tickets: [927737] - - - title: "Driver for Motorola Droid XT910, Nokia E71 and HTC EVO 3D." 
- tickets: [928202, 927818, 929400] - - - title: "Cut down the time taken to launch worker processes by 40%" - - - title: "You can now configure the calibre settings for the currently connected device by right clicking on the device icon in the toolbar, instead of having to go through Preferences->Plugins" - - bug fixes: - - title: "Auto-adding: Do not add incomplete files when files are downloaded directly into the auto add folder." - tickets: [926578] - - - title: "When running multiple delete from device jobs, fix the device view sometimes marking the wrong books as being deleted, after the first delete job completes." - tickets: [927972] - - - title: "MOBI Input: Handle files that have spurious closing </body> and/or </html> tags in their markup." - tickets: [925833] - - - title: "RTF Input: Strip out false color specifications, as they cause artifacts when converted to MOBI" - - improved recipes: - - Updated Postmedia publications - - Foreign Affairs - - Read It Later - - Microwave Journal - - taggeschau.de - - new recipes: - - title: Vancouver Province and Windsor Star - author: Nick Redding - - - title: Onda Rock - author: faber1971 - - - title: Il Manifesto - author: Giacomo Lacava - -- version: 0.8.38 - date: 2012-02-03 - - new features: - - title: "Implement the ability to automatically add books to calibre from a specified folder." - type: major - description: "calibre can now watch a folder on your computer and instantly add any files you put there to the calibre library as new books. You can tell calibre which folder to watch via Preferences->Adding Books->Automatic Adding." - tickets: [920249] - - - title: "Conversion: When automatically inserting page breaks, do not put a page break before a <h1> or <h2> tag if it is immediately preceded by another <h1> or <h2> tag." 
- - - title: "Driver for EZReader T730 and Point-of-View PlayTab Pro" - tickets: [923283, 922969] - - bug fixes: - - title: "Fix device entry not visible in menubar even when it has been added via Preferences->Toolbars." - tickets: [923175] - - - title: "Fix metadata plugboards not applied when auto sending news by email" - - - title: "Fix regression in 0.8.34 that broke recipes that used skip_ad_pages() but not get_browser(). " - tickets: [923724] - - - title: "Restore device support on FreeBSD, by using HAL" - tickets: [924503] - - - title: "Get books: Show no more than 10 results from the Gandalf store" - - - title: "Content server: Fix metadata not being updated when sending for some MOBI files." - tickets: [923130] - - - title: "Heuristic processing: Fix the italicize common patterns algorithm breaking on some HTML markup." - tickets: [922317] - - - title: "When trying to find an ebook inside a zip file, do not fail if the zip file itself contains other zip files." - tickets: [925670] - - - title: "EPUB Input: Handle EPUBs with duplicate entries in the manifest." - tickets: [925831] - - - title: "MOBI Input: Handle files that have extra </html> tags sprinkled through out their markup." - tickets: [925833] - - improved recipes: - - Metro Nieuws NL - - FHM UK - - new recipes: - - title: Strange Horizons - author: Jim DeVona - - - title: Telegraph India and Live Mint - author: Krittika Goyal - - - title: High Country News - author: Armin Geller - - - title: Countryfile - author: Dave Asbury - - - title: Liberation (subscription version) - author: Remi Vanicat - - - title: Various Italian news sources - author: faber1971 - - -- version: 0.8.37 - date: 2012-01-27 - - new features: - - title: "Allow calibre to be run simultaneously in two different user accounts on windows." 
- tickets: [919856] - - - title: "Driver for Motorola Photon and Point of View PlayTab" - tickets: [920582, 919080] - - - title: "Add a checkbox to preferences->plugins to show only user installed plugins" - - - title: "Add a restart calibre button to the warning dialog that pops up after changing some preference that requires a restart" - - bug fixes: - - title: "Fix regression in 0.8.36 that caused the remove format from book function to only delete the entry from the database and not delete the actual file from the disk" - tickets: [921721] - - - title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh the format information in the GUI" - tickets: [919494] - - - title: "E-book viewer: Preserve the current position more accurately when changing font size/other preferences." - tickets: [912406] - - - title: "Conversion pipeline: Fix items in the <guide> that refer to files with URL unsafe filenames being ignored." - tickets: [920804] - - - title: "Fix calibre not running on linux systems that set LANG to an empty string" - - - title: "On first run of calibre, ensure the columns are sized appropriately" - - - title: "MOBI Output: Do not collapse whitespace when setting the comments metadata in newly created MOBI files" - - - title: "HTML Input: Fix handling of files with ä characters in their filenames." - tickets: [919931] - - - title: "Fix the sort on startup tweak ignoring more than three levels" - tickets: [919584] - - - title: "Edit metadata dialog: Fix a bug that broke adding of a file to the book that calibre did not previously know about in the books directory while simultaneously changing the author or title of the book." 
- tickets: [922003] - - improved recipes: - - People's Daily - - Plus Info - - grantland.com - - Eret es irodalom - - Sueddeutsche.de - - new recipes: - - title: Mumbai Mirror - author: Krittika Goyal - - - title: Real Clear - author: TMcN - - - title: Gazeta Wyborcza - author: ravcio - - - title: The Daily News Egypt and al masry al youm - author: Omm Mishmishah - - - title: Klip.me - author: Ken Sun - - -- version: 0.8.36 - date: 2012-01-20 - - new features: - - title: "Decrease startup time for large libraries with at least one composite custom column by reading format info on demand" - - - title: "When automatically deleting news older than x days, from the calibre library, only delete the book if it both has the tag News and the author calibre. This prevents accidental deletion of books tagged with News by the user." - - - title: "Driver for Infibeam Pi 2" - - - title: "Add a Tag Editor for tags like custom columns to the edit metadata dialog" - - bug fixes: - - title: "E-book viewer: Fix regression in 0.8.35 that caused viewer to raise an error on books that did not define a language" - - - title: "Content server: Fix grouping for categories based on custom columns." - tickets: [919011] - - - title: "Edit metadata dialog: When setting the series from a format or via metadata download, ensure that the series index is not automatically changed, when closing the dialog." - tickets: [918751] - - - title: "When reading metadata from Topaz (azw1) files, handle non ascii metadata correctly." - tickets: [917419] - - - title: "CHM Input: Do not choke on CHM files with non ascii internal filenames on windows." 
- tickets: [917696] - - - title: "Fix reading metadata from CHM files with non-ascii titles" - - - title: "Fix HTML 5 parser choking on comments" - - - title: "If calibre is started from a directory that does not exist, automatically use the home directory as the working directory, instead of crashing" - - - title: "Fix iriver story HD Wi-Fi device and external SD card swapped" - tickets: [916364] - - - title: "Content server: Fix ugly URLs for specific format download in the book details and permalink panels" - - - title: "When adding FB2 files do not set the date field from the metadata in the file" - - improved recipes: - - OReilly Premuim - - Variety - - Blic - - New Journal of Physics - - Der Tagesspiegel - - new recipes: - - title: Tweakers.net - author: Roedi06 - - - title: Village Voice - author: Barty - - - title: Edge.org Conversations - author: levien - - - title: Novi list - printed edition - author: Darko Miletic - -- version: 0.8.35 - date: 2012-01-13 - - new features: - - title: "Metadata plugboards: Allow creation of plugboards for email delivery." - tickets: [914012] - - - title: "Tweak EPUB: Also allow tweaking of HTMLZ files (when both EPUB and HTMLZ are present, EPUB is preferred, this can be changed via Preferences->Tweaks)." - - - title: "TXT Input: Support direct conversion of files with extensions .md, .markdown and .textile." - tickets: [912931] - - - title: "E-book viewer: Speed up the optional hyphenation algorithm by upgrading the hyphenator library calibre uses" - - - title: "Drivers for PocketBook 611, Motorola Razr Droid and Onyx Boox i62" - - bug fixes: - - title: "MOBI Output: When converting a paragraph that contains only a non-breaking space into a line break, ignore paragraphs with height less than 2pt." - tickets: [915150] - - - title: "MOBI Input: Handle MOBI files that specify anchor point exactly at pagebreaks. These are apparently produced by John Wiley and Sons." 
- tickets: [914036] - - - title: "Fetch news dialog: The Download now button is no longer scrolled out of view on OS X for news sources that require credentials" - - - title: "Fix commas being removed from author names when generating filenames in the calibre library" - - - title: "ODT Input: Dont crash on empty links" - - - title: "ebook-convert: Allow use of leading ./ when specifying output file names." - tickets: [913954] - - - title: "Fix deleting of hierarchical searches broken in Tag Browser" - tickets: [912345] - - - title: "Metadata search and replace: Fix rendering error when choosing {template}" - tickets: [913154] - - - title: "Fix calibre not starting when stray .po files are present in the working directory" - tickets: [913054] - - - title: "Do not error out when getting metadata for authors if the author name has either ::: or :#: in it." - tickets: [912713] - - improved recipes: - - Pagina 12 - - USA Today - - LWN Weekly - - Seattle Times - - San Jose Mercury - - Grantland.com - - new recipes: - - title: Lega Nerd and Pambianco - author: faber1971 - - - title: Various Turkish news sources - author: asalet_r - - - title: Microwave Journal - author: Kiavash - - - title: OReilly Premium - author: TechnoCat - - - title: Hamilton Spectator and Tillsonburg/Norfolk County - author: Eric Coolman - - - title: Opinion Bolivia - author: Piet van Oostrum - - - title: ideal.es - author: Josemi Liebana - - - title: Novilist Portal - author: Darko Miletic - -- version: 0.8.34 - date: 2012-01-06 - - new features: - - title: "Apple driver: Set the year field in iTunes based on the published date in calibre." - tickets: [909050] - - - title: "EPUB Input: When converting a file that has entries in the manifest that do no exist, remove them, instead of aborting the conversion." - tickets: [910933] - - - title: "Kindle driver: Ensure page counts are correctly sent to the device when connecting to Kindle 4/Touch." 
- tickets: [910279] - - - title: "Allow user to set the number of recently viewed books shown in the dropdown menu of the view button, via a tweak in Preferences->Tweaks." - tickets: [910292] - - bug fixes: - - title: "Fix regression in 0.8.33 that caused calibre to crash when starting the Content Server, if the port the content server is trying to listen on is blocked/busy." - tickets: [910512] - - - title: "MOBI Input: Fix regression that caused a mixup of images when the MOBI file header contains an incorrect first image index pointer." - tickets: [911243] - - - title: "Do not remove leading and trailing spaces from the replace fields in the Search and Replace conversion options" - tickets: [910523] - - - title: "Conversion pipeline: Fix regression in 0.8.31 that broke parsing of documents containing a self closing <title/> tag." - tickets: [910325] - - improved recipes: - - Kopalnia Wiedzy - - Alternet - - Tagesspiegel - - Philadelphia Inquirer - - Seattle Times - - La Razon - - new recipes: - - title: Various Italian news sources - author: faber1971 - - - title: money.pl - author: intromatyk - - - title: Diario Rio Negro - author: Darko Miletic. - - - title: FHM UK - author: Dave Asbury - -- version: 0.8.33 - date: 2011-12-30 - - new features: - - title: "LIT Input: Switch to non-recursive algorithm, to allow conversion of files with deeply nested markup." - tickets: [909535] - - - title: "Content server: Do not show the original_* formats in the mobile interface. Also upgrade to the latest CherryPy release." - - - title: "E-book viewer: Add option in viewer preferences to control how much the font size is changed when you click the make fonts bigger/smaller buttons." - tickets: [908980] - - - title: "E-book viewer: Allow changing font size via Ctrl+Mouse wheel" - tickets: [908975] - - - title: "Kobo driver: Hide previews and recommendations from the book list. 
You can customize the Kobo plugin if you would like to see them via Preferences->Plugins" - - bug fixes: - - title: "Copy to library: Fix title sort not being copied" - - - title: "PDF Output: Add custom size conversion option to the GUI (it was only present on the command line before)" - - - title: "Add missing --keep-ligatures option to the ebook-convert command line" - tickets: [909182] - - - title: "Fix rendering of non ascii characters in generated masthead images when downloading news for the Kindle" - - - title: "Linux binary: Disable qt-sp-api as it causes crashes/performance issues on various distros" - - - title: "E-book viewer: Ensure that reference mode highlighting is removed from the book when reference mode is closed." - tickets: [908982] - - - title: "Fix unable to load plugins from files on GNOME/XFCE desktops" - - - title: "Fix regression that broke customizing toolbars on non English calibre installs" - - - title: "Conversion pipeline: Disable HTML 5 parsing if it results in deeply nested trees." - tickets: [908818] - - - title: "Do not lose position in book list on device connection/sync, if a search is active." - tickets: [908553] - - - title: "Fix regression in 0.8.32 that broke deleting books if the path to the library contained non-ascii characters on linux" - tickets: [908068] - - improved recipes: - Echo Online - La Razon - - new recipes: - title: NYTimes Global - author: Krittika Goyal - - - title: Grantland - author: Barty - -- version: 0.8.32 - date: 2011-12-23 - - new features: - - title: "Linux: When deleting books, send them to the recycle bin, instead of permanently deleting. This is the same behavior as on Windows and OS X." 
- - - title: "Add a checkbox to allow users to disable the popup that asks if books should be auto-converted before sending to device" - - - title: "Drivers for Droid Razr, Samsung GT-I9003 and Bookeen Odyssey" - tickets: [906356, 906056, 905862] - - - title: "Allow passing multiple filenames as command line arguments to calibre, to add multiple books." - tickets: [907968] - - bug fixes: - - title: "MOBI Output: Fix regression in 0.8.30 that caused the use of hidden heading elements for the TOC to generate links in the wrong place." - tickets: [907156] - - - title: "EPUB Output: Ensure directories have the correct permissions bits set when unzipping an epub with unzip on Unix" - - - title: "Fix bottom most shortcuts in keyboard shortcuts for viewer not editable" - - - title: "EPUB Output: Fix handling of self closing <audio> tags." - tickets: [906521] - - - title: "MOBI Input: Map invalid <o:p> tags to <p> tags before parsing, to handle broken nesting." - tickets: [905715] - - - title: "Conversion pipeline: HTML5 parsing: Fix handling of XML namespaces. Fixes regression in 0.8.30 that caused some articles in some news downloads to appear blank when viewed in Adobe Digital Editions based readers" - - - title: "Get Books: Gandalf store, fix price and cover detection" - - - title: "EPUB Output: Fix the Flatten filenames option in EPUB Output causing duplicated manifest ids in rare cases." - tickets: [905692] - - - title: "When adding books via ISBN, show the user the list of invalid ISBNs that will be ignored, if any, before starting the add operation." - tickets: [905690] - - - title: "Fix unsmarten punctuation conversion option broken in 0.8.31." 
- tickets: [905596] - - - title: "Fix broken evaluation of composite columns in save-to-disk" - - improved recipes: - - Cosmopolitan UK - - Hindustan Times - - HVG - - moneynews.com - - Ming Pao - - Glasgow Herald - - Times of India - - Focus Magazine - - Hacker News - - Independent - - Sueddeutsche - - new recipes: - - title: Prospect Magazine UK - author: Barty and duoloz - - - title: Elet es Irodalom and NOL - author: Bigpapa - - - title: Salonica Press News - author: SteliosGero - - - title: Echo Online - author: Armin Geller - - - title: Various Polish news sources - author: fenuks - - - title: Various Italian news sources - author: faber1971 - -- version: 0.8.31 - date: 2011-12-16 - - new features: - - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness." - tickets: [901466] - - - title: "Driver for PocketBook 611 and Lenovo IdeaPad" - - - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks." - tickets: [902731] - - - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, GUardian and Globe and Mail recipes" - tickets: [900130] - - - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word" - - - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK" - - bug fixes: - - title: "Fix comments not hidden in Book details panel when they are turned off via Preferences->Look & Feel->Book Details" - - - title: "E-book viewer: Do not popup an error message if the user tries to use the mouse wheel to scroll before a document is loaded." - tickets: [903449] - - - title: "Add docx to the list of ebook extensions." 
- tickets: [903452] - - - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles." - - - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book." - tickets: [902506] - - - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification" - - - title: "Catalog generation: Include the series_index field for custom series columns as well" - - - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)" - - - title: "HTML Input: Ignore unparseable URLs instead of crashing on them." - tickets: [902372] - - - improved recipes: - - La Republica - - CND - - Berliner Zeitung - - Zaman Gazetesi - - new recipes: - - title: CND Weekly - author: Derek Liang - - - title: descopera.org - author: Marius Ignatescu - - - title: Rynek Zdrowia - author: spi630 - -- version: 0.8.30 - date: 2011-12-09 - - new features: - - title: "Get Books: Add amazon.es and amazon.it" - - - title: "Bulk convert dialog: Disable the Use saved conversion settings checkbox when none of the books being converted has saved conversion settings" - - - title: "ebook-viewer: And a command line switch to specify the position at which the file should be opened." - tickets: [899325] - - - title: "Distribute calibre source code compressed with xz instead of gzip for a 40% reduction in size" - - bug fixes: - - title: "Get Books: Fix ebooks.com and amazon.fr. Fix cover display in Diesel ebooks store." - - - title: "HTML Input: Fix regression that broke processing of a small fraction of HTML files encoded in a multi-byte character encoding." 
- tickets: [899691] - - - title: "Greatly reduce the delay at the end of a bulk metadata edit operation that operates on a very large number (thousands) of books" - - - title: "Template language: Fix the subitems formatter function to split only when the period is surrounded by non-white space and not another period" - - - title: "Fix ampersands in titles not displaying in the Cover Browser" - - - title: "MOBI Output: Do not ignore an empty anchor at the end of a block element." - - - title: "MOBI Output: Handle links to inline anchors placed inside large blocks of text correctly, i.e. the link should not point to the start of the block." - tickets: [899831] - - - title: "E-book viewer: Fix searching for text that is represented as entities in the underlying HTML." - tickets: [899573] - - - title: "Have the Esc shortcut perform exactly the same set of actions as clicking the clear button." - tickets: [900048] - - - title: "Prevent the adding books dialog from becoming too wide" - - - title: "Fix custom column editing not behaving correctly with the Previous button in the edit metadata dialog." - tickets: [899836] - - - title: "T1 driver. More fixes to datetime handling to try to convince the T1's buggy firmware to not rescan metadata." - tickets: [899514] - - - title: "Only allow searching via non accented author names if the user interface language in calibre is set to English." - tickets: [899227] - - improved recipes: - - Die Zeit subscription - - Metro UK - - suedeutsche.de - - new recipes: - - title: Blues News - author: Oskar Kunicki - - - title: "TVXS" - author: Hargikas - - -- version: 0.8.29 - date: 2011-12-02 - - new features: - - title: "When searching for author names with accented characters, allow the non accented version to match. For example, searching for Nino will now match Niño." 
- tickets: [879729] - - - title: "Driver for Blackberry Playbook, Motorola Electrify and Samsung Galaxy GIO S5660" - tickets: [805745,898123,897330] - - - title: "Metadata search and replace, make the regular expressions unicode aware" - - bug fixes: - - title: "Fix regression in 0.8.28 that broke sending PDF files to iTunes" - tickets: [896791] - - - title: "Metadata download, do not strip # from titles." - tickets: [898310] - - - title: "Conversion pipeline: Do not error out on books that set font size to zero." - tickets: [898194] - - - title: "News download: Respect the delay setting when downloading RSS feeds as well." - tickets: [897907] - - - title: "EPUB Output: Ensure that xml:lang is set if lang is set as ADE looks for xml:lang, not lang" - tickets: [897531] - - - title: "Content server: Reduce memory consumption when sending very large files" - tickets: [897343] - - - title: "Preserve capitalization of Scottish author names when downloading metadata" - - - title: "Fix update title sort in bulk metadata edit not using language information" - - - title: "Fix sorting by published column in the download metadata dialog broken" - tickets: [896832] - - - title: "Allow use of languages field when generating CSV/XML catalogs" - tickets: [896620] - - - title: "Get Books: Fix ebookpoint.pl" - - - title: "When calculating title sort for a book based on its language, only use the specified language not a combination of the language and english" - tickets: [896412] - - improved recipes: - - Metro NL - - Ming Pao - - Rolling Stones Mag - - Buffalo News - - new recipes: - - title: gs24.pl and Gazeta.pl Szczecin - author: Michal Szkutnik - - - title: Vanity Fair - author: Barty - - - title: Skylife - author: thomass - - - title: Daily Writing Tips - author: NotTaken - - - title: TechDirt - author: Krittika Goyal - - - title: Cosmopolitan UK - author: Dave Asbury - -- version: 0.8.28 - date: 2011-11-25 - - new features: - - title: "Get Books: Add litres.ru store" - - - 
title: "Change the algorithm that generates title sort strings to strip leading articles from both english and the current language set for the calibre user interface. In addition, in the edit metadata dialog, calibre will use the book's language when calculating the sort string. This behavior can be adjusted via Preferences->Tweaks." - tickets: [886763] - - - title: "Driver for Cybook Odyssey." - tickets: [893457] - - - title: "Irex driver: Put books into the top level directory instead of into /ebooks or /Books." - tickets: [883616] - - bug fixes: - - title: "Have downloaded periodicals recognized when transferred via USB to the Kindle Fire" - - - title: "MOBI Output: Fix underline and strikethrough properties declared on parents not being rendered on child tags." - tickets: [894245] - - - title: "Template language: Fix regression that broke ordering of items when formatting a list" - - - title: "Conversion pipeline: When removing obsolete <font> tags convert them to <div> instead of <span> if they contain block level tags." - tickets: [892525] - - - title: "When downloading metadata, fix the case normalization of double-barelled author names." - tickets: [893257] - - - title: "Template language: Fix regression that broke using general program mode in save to disk templates" - - - title: "calibredb: Fix use of ranges when specifying ids for the remove command" - - - title: "Apple driver: Add ids for iPhone 4S. More robust against iTunes automation errors when adding artwork." 
- tickets: [892468] - - - title: "Fix encoding of comments incorrectly detected when downloading metadata from ozon.ru" - - - title: "Fix calibre not getting list of books on the Kindle Fire" - - improved recipes: - - El Mundo - - BBC - - NIN Online - - ABC Australia - - Salon.com - - Expansion (Spanish) - - The Week - - Heise Online - - new recipes: - - title: Give me something to read and Let's get Critical - author: Barty - - - title: Worldcrunch - author: Krittika Goyal - -- version: 0.8.27 - date: 2011-11-18 - - new features: - - title: "Drivers for the Kindle Fire and the Nook Tablet" - tickets: [890918] - - - title: "Conversion: Add an option under Look & Feel to remove specified style information (CSS) from the document during conversion." - tickets: [871384] - - - title: "Add an option in the bulk metadata edit dialog to restore the pre-conversion files for many books with a single click." - tickets: [886116] - - - title: "Jobs list: Add the ability to search for and to hide jobs, useful if you have run a lot of jobs and the list is getting crowded." - tickets: [883734] - - - title: "Book jacket generation: Add ability to customize the book jacket template and add custom columns into the jacket." - tickets: [889912] - - - title: "MOBI Input: Performance improvement when viewing/converting a file with a lot of links" - - bug fixes: - - title: "Fix regression in 0.8.26 that broke disabling the update of particular fields during a bulk metadata download." - tickets: [889696] - - - title: "Get Books: Fix DRM status for legimi" - - - title: "When parsing for lxml via BeatifulSoup, use the calibre modified copy of BeautifulSoup (more robust)." 
- tickets: [889890] - - - title: "HTML Input: Handle double encoded URLs in img tags" - tickets: [889323] - - improved recipes: - - Various Polish recipes - - Academia Catavencu - - El Periodico de Aragon - - Weblogs SL - - Folha de Sao Paolo (subscription) - - new recipes: - - title: News on Japan - author: Krittika Goyal - - - title: Formula AS - author: Silviu Cotoara - - - title: Various Turkish news sources - author: Osman Kaysan - - - title: Infra.pl and Spider's Web - author: fenuks - - -- version: 0.8.26 - date: 2011-11-12 - - new features: - - title: "Tweak to control sorting of date type columns. You can choose to have them sorted only by displayed fields" - - - title: "Driver for the Trekstor 3.0" - - - title: "Performance improvements when evaluating templates, and in particular general program mode templates" - - bug fixes: - - title: "ODT Input: When converting to EPUB improve handling of large images placed inside small frames, to prevent them from obscuring text." - tickets: [860272,884759] - - - title: "EPUB Input: Automatically strip entries of type application/text from the spine. Apparently there are EPUB production tools out there that create them." - tickets: [884792] - - - title: "Keep the startup splash screen visible until the GUI has fully completed initializing." - tickets: [885827] - - - title: "ODT Input: Fix handling of span tags containing only whitespace." - tickets: [887311] - - - title: "On windows when changing title or author via the main book list, handle the case of one of the books files being open in another program more gracefully." - tickets: [880585] - - - title: "When adding a format to an existing book record, ensure that no changes are made to the database until after the file operations have succeeded." 
- - - title: "Fix bug that prevented configuring which fields to download metadata for when adding books by ISBN" - tickets: [856076] - - - title: "Fix Japanese characters not being crrectly displayed on index pages in news downloads for the SONY T1" - tickets: [888029] - - - title: "Get Books: Fix booleans in search expressions not working in non-English calibre versions" - tickets: [887554] - - - title: "Fix a bug in the support for hours/minutes/seconds in datetime format strings" - tickets: [887412] - - - title: "Treat an author_sort value of 'Unknown' the same way as unknown authors are treated in template processing" - - - title: "Detect SD card in Kobo Vox" - - - title: "Amazon metadata download: Workaround for change in Amazon website causing some books to have incorrect ratings downloaded" - - improved recipes: - - Metro NL - - The Independent - - Frankfurter Rundschau - - L'Espresso - - Il Giornale - - Berlingske.dk - - Suedeutsche Zeitung - - new recipes: - - title: Techtarget - author: Julio Map - - diff --git a/manual/conversion.rst b/manual/conversion.rst index feae2a4273..817821a9b1 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -672,6 +672,7 @@ Some limitations of PDF input are: * Links and Tables of Contents are not supported * PDFs that use embedded non-unicode fonts to represent non-English characters will result in garbled output for those characters * Some PDFs are made up of photographs of the page with OCRed text behind them. In such cases |app| uses the OCRed text, which can be very different from what you see when you view the PDF file + * PDFs that are used to display complex text, like right to left languages and math typesetting will not convert correctly To re-iterate **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an output ranging anywhere from decent to unusable, depending on the input PDF. 
diff --git a/manual/develop.rst b/manual/develop.rst index 719c876b33..823a31b5c2 100644 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -39,27 +39,27 @@ All the |app| python code is in the ``calibre`` package. This package contains t * devices - All the device drivers. Just look through some of the built-in drivers to get an idea for how they work. - * For details, see: devices.interface which defines the interface supported by device drivers and devices.usbms which + * For details, see: devices.interface which defines the interface supported by device drivers and ``devices.usbms`` which defines a generic driver that connects to a USBMS device. All USBMS based drivers in |app| inherit from it. * ebooks - All the ebook conversion/metadata code. A good starting point is ``calibre.ebooks.conversion.cli`` which is the - module powering the :command:`ebook-convert` command. The conversion process is controlled via conversion.plumber. - The format independent code is all in ebooks.oeb and the format dependent code is in ebooks.format_name. + module powering the :command:`ebook-convert` command. The conversion process is controlled via ``conversion.plumber``. + The format independent code is all in ``ebooks.oeb`` and the format dependent code is in ``ebooks.format_name``. - * Metadata reading, writing, and downloading is all in ebooks.metadata + * Metadata reading, writing, and downloading is all in ``ebooks.metadata`` * Conversion happens in a pipeline, for the structure of the pipeline, see :ref:`conversion-introduction`. The pipeline consists of an input plugin, various transforms and an output plugin. The that code constructs - and drives the pipeline is in plumber.py. The pipeline works on a + and drives the pipeline is in :file:`plumber.py`. The pipeline works on a representation of an ebook that is like an unzipped epub, with manifest, spine, toc, guide, html content, etc. The - class that manages this representation is OEBBook in oeb/base.py. 
The + class that manages this representation is OEBBook in ``ebooks.oeb.base``. The various transformations that are applied to the book during - conversions live in `oeb/transforms/*.py`. And the input and output - plugins live in `conversion/plugins/*.py`. + conversions live in :file:`oeb/transforms/*.py`. And the input and output + plugins live in :file:`conversion/plugins/*.py`. - * library - The database back-end and the content server. See library.database2 for the interface to the |app| library. library.server is the |app| Content Server. - * gui2 - The Graphical User Interface. GUI initialization happens in gui2.main and gui2.ui. The ebook-viewer is in gui2.viewer. + * library - The database back-end and the content server. See ``library.database2`` for the interface to the |app| library. ``library.server`` is the |app| Content Server. + * gui2 - The Graphical User Interface. GUI initialization happens in ``gui2.main`` and ``gui2.ui``. The ebook-viewer is in ``gui2.viewer``. If you need help understanding the code, post in the `development forum <http://www.mobileread.com/forums/forumdisplay.php?f=240>`_ and you will most likely get help from one of |app|'s many developers. diff --git a/manual/faq.rst b/manual/faq.rst index b5f8f382b1..ba11c865f3 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -250,42 +250,71 @@ If you don't want to uninstall it altogether, there are a couple of tricks you c simplest is to simply re-name the executable file that launches the library program. More detail `in the forums <http://www.mobileread.com/forums/showthread.php?t=65809>`_. -How do I use |app| with my iPad/iPhone/iTouch? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use |app| with my iPad/iPhone/iPod touch? 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Over the air ^^^^^^^^^^^^^^ -The easiest way to browse your |app| collection on your Apple device (iPad/iPhone/iPod) is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app| +The easiest way to browse your |app| collection on your Apple device +(iPad/iPhone/iPod) is by using the |app| content server, which makes your +collection available over the net. First perform the following steps in |app| - * Set the Preferred Output Format in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to iPad (this will work for iPhone/iPods as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your iPhone to EPUB format by selecting them and clicking the Convert button. - * Turn on the Content Server in |app|'s preferences and leave |app| running. + * Set the Preferred Output Format in |app| to EPUB (The output format can be + set under :guilabel:`Preferences->Interface->Behavior`) + * Set the output profile to iPad (this will work for iPhone/iPods as well), + under :guilabel:`Preferences->Conversion->Common Options->Page Setup` + * Convert the books you want to read on your iDevice to EPUB format by + selecting them and clicking the Convert button. + * Turn on the Content Server by clicking the :guilabel:`Connect/Share` button + and leave |app| running. You can also tell |app| to automatically start the + content server via :guilabel:`Preferences->Sharing over the net`. -Now on your iPad/iPhone you have two choices, use either iBooks (version 1.2 and later) or Stanza (version 3.0 and later). Both are available free from the app store. +There are many apps for your iDevice that can connect to |app|. Here we +describe using two of them, iBooks and Stanza. 
Using Stanza *************** -Now you should be able to access your books on your iPhone by opening Stanza. Go to "Get Books" and then click the "Shared" tab. Under Shared you will see an entry "Books in calibre". If you don't, make sure your iPad/iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza. To do this, click the "Shared" tab, then click the "Edit" button and then click "Add book source" to add a new book source. In the Add Book Source screen enter whatever name you like and in the URL field, enter the following:: +You should be able to access your books on your iPhone by opening Stanza. Go to +"Get Books" and then click the "Shared" tab. Under Shared you will see an entry +"Books in calibre". If you don't, make sure your iPad/iPhone is connected using +the WiFi network in your house, not 3G. If the |app| catalog is still not +detected in Stanza, you can add it manually in Stanza. To do this, click the +"Shared" tab, then click the "Edit" button and then click "Add book source" to +add a new book source. In the Add Book Source screen enter whatever name you +like and in the URL field, enter the following:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. Now click "Save" and you are done. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. 
A quick Google +search will tell you how to find out your local IP address. Now click "Save" +and you are done. -If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the stanza settings. Go to Info->Settings and increase the value of Download Timeout. +If you get timeout errors while browsing the calibre catalog in Stanza, try +increasing the connection timeout value in the stanza settings. Go to +Info->Settings and increase the value of Download Timeout. Using iBooks ************** -Start the Safari browser and type in the IP address and port of the computer running the calibre server, like this:: +Start the Safari browser and type in the IP address and port of the computer +running the calibre server, like this:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. A quick Google +search will tell you how to find out your local IP address. -You will see a list of books in Safari, just click on the epub link for whichever book you want to read, Safari will then prompt you to open it with iBooks. +You will see a list of books in Safari, just click on the epub link for +whichever book you want to read, Safari will then prompt you to open it with +iBooks. With the USB cable + iTunes @@ -550,9 +579,23 @@ Yes, you can. 
Follow the instructions in the answer above for adding custom columns. How do I move my |app| library from one computer to another? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome wizard wont run. In that case, right-click the |app| icon in the tooolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the |app| icon on the toolbar. Transferring your library in this manner preserver all your metadata, tags, custom columns, etc. +Simply copy the |app| library folder from the old to the new computer. You can +find out what the library folder is by clicking the calibre icon in the +toolbar. The very first item is the path to the library folder. Now on the new +computer, start |app| for the first time. It will run the Welcome Wizard asking +you for the location of the |app| library. Point it to the previously copied +folder. If the computer you are transferring to already has a calibre +installation, then the Welcome wizard won't run. In that case, right-click the +|app| icon in the toolbar and point it to the newly copied directory. You will +now have two |app| libraries on your computer and you can switch between them +by clicking the |app| icon on the toolbar. Transferring your library in this +manner preserves all your metadata, tags, custom columns, etc. 
-Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also right-click the |app| icon on the tool bar, select Library Maintenance and run the Check Library action. It will warn you about any problems in your library, which you should fix by hand. +Note that if you are transferring between different types of computers (for +example Windows to OS X) then after doing the above you should also right-click +the |app| icon on the tool bar, select Library Maintenance and run the Check +Library action. It will warn you about any problems in your library, which you +should fix by hand. .. note:: A |app| library is just a folder which contains all the book files and their metadata. All the metadata is stored in a single file called metadata.db, in the top level folder. If this file gets corrupted, you may see an empty list of books in |app|. In this case you can ask |app| to restore your books by doing a right-click on the |app| icon in the toolbar and selecting Library Maintenance->Restore Library. @@ -587,7 +630,10 @@ or a Remote Desktop solution. If you must share the actual library, use a file syncing tool like DropBox or rsync or Microsoft SkyDrive instead of a networked drive. Even with these tools there is danger of data corruption/loss, so only do this if you are -willing to live with that risk. +willing to live with that risk. In particular, be aware that **Google Drive** +is incompatible with |app|, if you put your |app| library in Google Drive, you +*will* suffer data loss. See +`this thread <http://www.mobileread.com/forums/showthread.php?t=205581>`_ for details. Content From The Web --------------------- @@ -663,7 +709,7 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c |app| freezes/crashes occasionally? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There are five possible things I know of, that can cause this: +There are several possible things I know of, that can cause this: * You recently connected an external monitor or TV to your computer. In this case, whenever |app| opens a new window like the edit metadata @@ -671,10 +717,6 @@ There are five possible things I know of, that can cause this: you dont notice it and so you think |app| has frozen. Disconnect your second monitor and restart calibre. - * You are using a Wacom branded USB mouse. There is an incompatibility between - Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom - mouse. - * If you use RoboForm, it is known to cause |app| to crash. Add |app| to the blacklist of programs inside RoboForm to fix this. Or uninstall RoboForm. @@ -685,6 +727,17 @@ There are five possible things I know of, that can cause this: * Constant Guard Protection by Xfinity causes crashes in |app|. You have to manually allow |app| in it or uninstall Constant Guard Protection. + * Spybot - Search & Destroy blocks |app| from accessing its temporary files + breaking viewing and converting of books. + + * You are using a Wacom branded USB mouse. There is an incompatibility between + Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom + mouse. + + * On some 64 bit versions of Windows there are security software/settings + that prevent 64-bit |app| from working properly. If you are using the 64-bit + version of |app| try switching to the 32-bit version. + If none of the above apply to you, then there is some other program on your computer that is interfering with |app|. First reboot your computer in safe mode, to have as few running programs as possible, and see if the crashes still diff --git a/manual/gui.rst b/manual/gui.rst index a51ced54d3..b7cd4230bf 100755 --- a/manual/gui.rst +++ b/manual/gui.rst @@ -531,12 +531,16 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. 
Thes - Get Books * - :kbd:`I` - Show book details + * - :kbd:`K` + - Edit Table of Contents * - :kbd:`M` - Merge selected records * - :kbd:`Alt+M` - Merge selected records, keeping originals * - :kbd:`O` - Open containing folder + * - :kbd:`P` + - Polish books * - :kbd:`S` - Save to Disk * - :kbd:`V` diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 2224937f3c..dd47af946a 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -3,7 +3,7 @@ import re class Adventure_zone(BasicNewsRecipe): title = u'Adventure Zone' __author__ = 'fenuks' - description = u'Adventure zone - adventure games from A to Z' + description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.' category = 'games' language = 'pl' no_stylesheets = True @@ -11,7 +11,7 @@ class Adventure_zone(BasicNewsRecipe): max_articles_per_feed = 100 cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' index='http://www.adventure-zone.info/fusion/' - use_embedded_content=False + use_embedded_content = False preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: ''), (re.compile(r'</?table.*?>'), lambda match: ''), (re.compile(r'</?tbody.*?>'), lambda match: '')] @@ -21,37 +21,35 @@ class Adventure_zone(BasicNewsRecipe): extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - def parse_feeds (self): - feeds = BasicNewsRecipe.parse_feeds(self) - soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') - tag=soup.find(name='channel') - titles=[] - for r in tag.findAll(name='image'): - r.extract() - art=tag.findAll(name='item') - for i in art: - titles.append(i.title.string) - for feed in feeds: - for article in feed.articles[:]: - 
article.title=titles[feed.articles.index(article)] - return feeds - - '''def get_cover_url(self): soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php') cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] return getattr(self, 'cover_url', self.cover_url)''' - + + def populate_article_metadata(self, article, soup, first): + result = re.search('(.+) - Adventure Zone', soup.title.string) + if result: + result = result.group(1) + else: + result = soup.body.find('strong') + if result: + result = result.string + if result: + result = result.replace('&', '&') + result = result.replace(''', '’') + article.title = result def skip_ad_pages(self, soup): skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) skip_tag = skip_tag.findAll(name='a') - for r in skip_tag: - if r.strong: - word=r.strong.string.lower() - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + title = soup.title.string.lower() + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + for r in skip_tag: + if r.strong and r.strong.string: + word=r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) def preprocess_html(self, soup): footer=soup.find(attrs={'class':'news-footer middle-border'}) @@ -69,4 +67,4 @@ class Adventure_zone(BasicNewsRecipe): a['href']=self.index + a['href'] return soup - \ No newline at end of file + diff --git a/recipes/antyweb.recipe b/recipes/antyweb.recipe index c2576191dd..b7d3d2583c 100644 --- a/recipes/antyweb.recipe +++ 
b/recipes/antyweb.recipe @@ -43,6 +43,6 @@ class AntywebRecipe(BasicNewsRecipe): def preprocess_html(self, soup): for alink in soup.findAll('a'): if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) - return soup + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/app_funds.recipe b/recipes/app_funds.recipe index d5734fc451..b0ecbc87b3 100644 --- a/recipes/app_funds.recipe +++ b/recipes/app_funds.recipe @@ -24,4 +24,3 @@ class app_funds(BasicNewsRecipe): auto_cleanup = True feeds = [(u'blog', u'http://feeds.feedburner.com/blogspot/etVI')] - diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index 6bcc9bef6c..c0fc576c9f 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -1,10 +1,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Archeowiesci(BasicNewsRecipe): - title = u'Archeowiesci' + title = u'Archeowieści' __author__ = 'fenuks' category = 'archeology' language = 'pl' + description = u'Z pasją o przeszłości' cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' oldest_article = 7 needs_subscription='optional' @@ -29,4 +30,4 @@ class Archeowiesci(BasicNewsRecipe): br['log'] = self.username br['pwd'] = self.password br.submit() - return br \ No newline at end of file + return br diff --git a/recipes/astro_news_pl.recipe b/recipes/astro_news_pl.recipe index 2808fed6e1..b7a15a9809 100644 --- a/recipes/astro_news_pl.recipe +++ b/recipes/astro_news_pl.recipe @@ -2,7 +2,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class AstroNEWS(BasicNewsRecipe): title = u'AstroNEWS' __author__ = 'fenuks' - description = 'AstroNEWS- astronomy every day' + description = u'AstroNEWS regularnie dostarcza wiadomości o wydarzeniach związanych z astronomią i astronautyką. Informujemy o aktualnych odkryciach i wydarzeniach naukowych, zapowiadamy ciekawe zjawiska astronomiczne. 
Serwis jest częścią portalu astronomicznego AstroNET prowadzonego przez miłośników astronomii i zawodowych astronomów.' category = 'astronomy, science' language = 'pl' oldest_article = 8 diff --git a/recipes/astroflesz.recipe b/recipes/astroflesz.recipe index 0b92fdfa29..745ade420c 100644 --- a/recipes/astroflesz.recipe +++ b/recipes/astroflesz.recipe @@ -13,6 +13,7 @@ class Astroflesz(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + remove_attributes = ['style'] keep_only_tags = [dict(id="k2Container")] remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})] diff --git a/recipes/astronomia_pl.recipe b/recipes/astronomia_pl.recipe index 89a0e4c889..aa84860976 100644 --- a/recipes/astronomia_pl.recipe +++ b/recipes/astronomia_pl.recipe @@ -3,7 +3,7 @@ import re class Astronomia_pl(BasicNewsRecipe): title = u'Astronomia.pl' __author__ = 'fenuks' - description = 'Astronomia - polish astronomy site' + description = u'Astronomia.pl jest edukacyjnym portalem skierowanym do uczniów, studentów i miłośników astronomii. Przedstawiamy gwiazdy, planety, galaktyki, czarne dziury i wiele innych tajemnic Wszechświata.' 
masthead_url = 'http://www.astronomia.pl/grafika/logo.gif' cover_url = 'http://www.astronomia.pl/grafika/logo.gif' category = 'astronomy, science' diff --git a/recipes/bachormagazyn.recipe b/recipes/bachormagazyn.recipe new file mode 100644 index 0000000000..fb34552beb --- /dev/null +++ b/recipes/bachormagazyn.recipe @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = u'Łukasz Grąbczewski 2013' +__version__ = '1.0' + +''' +bachormagazyn.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class bachormagazyn(BasicNewsRecipe): + __author__ = u'Łukasz Grączewski' + title = u'Bachor Magazyn' + description = u'Alternatywny magazyn o alternatywach rodzicielstwa' + language = 'pl' + publisher = 'Bachor Mag.' + publication_type = 'magazine' + masthead_url = 'http://bachormagazyn.pl/wp-content/uploads/2011/10/bachor_header1.gif' + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + remove_empty_feeds = True + + oldest_article = 32 #monthly +1 + max_articles_per_feed = 100 + + feeds = [ + (u'Bezradnik dla nieudacznych rodziców', u'http://bachormagazyn.pl/feed/') + ] + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'content'})) + + remove_tags = [] + remove_tags.append(dict(attrs = {'id' : 'nav-above'})) + remove_tags.append(dict(attrs = {'id' : 'nav-below'})) + remove_tags.append(dict(attrs = {'id' : 'comments'})) + remove_tags.append(dict(attrs = {'class' : 'entry-info'})) + remove_tags.append(dict(attrs = {'class' : 'comments-link'})) + remove_tags.append(dict(attrs = {'class' : 'sharedaddy sd-sharing-enabled'})) diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe new file mode 100644 index 0000000000..01499f6369 --- /dev/null +++ b/recipes/badania_net.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class BadaniaNet(BasicNewsRecipe): + title = u'badania.net' + __author__ = 'fenuks' + description = 
u'chcesz wiedzieć więcej?' + category = 'science' + language = 'pl' + cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})] + remove_tags_after = dict(attrs={'class':'omc-single-tags'}) + keep_only_tags = [dict(id='omc-full-article')] + feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] diff --git a/recipes/bankier_pl.recipe b/recipes/bankier_pl.recipe index 8a68d844b3..a9701a80fe 100644 --- a/recipes/bankier_pl.recipe +++ b/recipes/bankier_pl.recipe @@ -47,4 +47,3 @@ class bankier(BasicNewsRecipe): segments = urlPart.split('-') urlPart2 = segments[-1] return 'http://www.bankier.pl/wiadomosci/print.html?article_id=' + urlPart2 - diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index 4ed59614e7..b772b7c3b4 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Bash_org_pl(BasicNewsRecipe): title = u'Bash.org.pl' __author__ = 'fenuks' - description = 'Bash.org.pl - funny quotations from IRC discussions' + description = 'Bash.org.pl - zabawne cytaty z IRC' category = 'funny quotations, humour' language = 'pl' cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png' @@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe): soup=self.index_to_soup(u'http://bash.org.pl/random/') #date=soup.find('div', attrs={'class':'right'}).string url=soup.find('a', 
attrs={'class':'qid click'}) - title=url.string - url='http://bash.org.pl' +url['href'] + title='' + url='http://bash.org.pl/random/' articles.append({'title' : title, 'url' : url, 'date' : '', @@ -44,9 +44,11 @@ class Bash_org_pl(BasicNewsRecipe): }) return articles + def populate_article_metadata(self, article, soup, first): + article.title = soup.find(attrs={'class':'qid click'}).string def parse_index(self): feeds = [] feeds.append((u"Najnowsze", self.latest_articles())) feeds.append((u"Losowe", self.random_articles())) - return feeds \ No newline at end of file + return feeds diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index 66d4f35e73..c934cc4ac4 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -1,74 +1,87 @@ from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.ebooks.BeautifulSoup import Comment + class BenchmarkPl(BasicNewsRecipe): title = u'Benchmark.pl' __author__ = 'fenuks' - description = u'benchmark.pl -IT site' + description = u'benchmark.pl, recenzje i testy sprzętu, aktualności, rankingi, sterowniki, porady, opinie' masthead_url = 'http://www.benchmark.pl/i/logo-footer.png' - cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif' + cover_url = 'http://www.benchmark.pl/i/logo-dark.png' category = 'IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - no_stylesheets=True + no_stylesheets = True + remove_attributes = ['style'] preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')] - keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')] - remove_tags_after=dict(name='div', attrs={'class':'body'}) - remove_tags=[dict(name='div', 
attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] - INDEX= 'http://www.benchmark.pl' + keep_only_tags = [dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')] + remove_tags_after = dict(id='article') + remove_tags = [dict(name='div', attrs={'class':['comments', 'body', 'kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs = {'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] + INDEX = 'http://www.benchmark.pl' feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] def append_page(self, soup, appendtag): - nexturl = soup.find('span', attrs={'class':'next'}) - while nexturl is not None: - nexturl= self.INDEX + nexturl.parent['href'] - soup2 = self.index_to_soup(nexturl) - nexturl=soup2.find('span', attrs={'class':'next'}) + nexturl = soup.find(attrs={'class':'next'}) + while nexturl: + soup2 = self.index_to_soup(nexturl['href']) + nexturl = soup2.find(attrs={'class':'next'}) pagetext = soup2.find(name='div', attrs={'class':'body'}) - appendtag.find('div', attrs={'class':'k_ster'}).extract() + tag = appendtag.find('div', attrs={'class':'k_ster'}) + if tag: + tag.extract() + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, 
pagetext) - if appendtag.find('div', attrs={'class':'k_ster'}) is not None: + if appendtag.find('div', attrs={'class':'k_ster'}): appendtag.find('div', attrs={'class':'k_ster'}).extract() + for r in appendtag.findAll(attrs={'class':'changePage'}): + r.extract() def image_article(self, soup, appendtag): - nexturl=soup.find('div', attrs={'class':'preview'}) - if nexturl is not None: - nexturl=nexturl.find('a', attrs={'class':'move_next'}) - image=appendtag.find('div', attrs={'class':'preview'}).div['style'][16:] - image=self.INDEX + image[:image.find("')")] + nexturl = soup.find('div', attrs={'class':'preview'}) + if nexturl: + nexturl = nexturl.find('a', attrs={'class':'move_next'}) + image = appendtag.find('div', attrs={'class':'preview'}).div['style'][16:] + image = self.INDEX + image[:image.find("')")] appendtag.find(attrs={'class':'preview'}).name='img' appendtag.find(attrs={'class':'preview'})['src']=image appendtag.find('a', attrs={'class':'move_next'}).extract() - while nexturl is not None: - nexturl= self.INDEX + nexturl['href'] + while nexturl: + nexturl = self.INDEX + nexturl['href'] soup2 = self.index_to_soup(nexturl) - nexturl=soup2.find('a', attrs={'class':'move_next'}) - image=soup2.find('div', attrs={'class':'preview'}).div['style'][16:] - image=self.INDEX + image[:image.find("')")] + nexturl = soup2.find('a', attrs={'class':'move_next'}) + image = soup2.find('div', attrs={'class':'preview'}).div['style'][16:] + image = self.INDEX + image[:image.find("')")] soup2.find(attrs={'class':'preview'}).name='img' soup2.find(attrs={'class':'preview'})['src']=image - pagetext=soup2.find('div', attrs={'class':'gallery'}) + pagetext = soup2.find('div', attrs={'class':'gallery'}) pagetext.find('div', attrs={'class':'title'}).extract() pagetext.find('div', attrs={'class':'thumb'}).extract() pagetext.find('div', attrs={'class':'panelOcenaObserwowane'}).extract() - if nexturl is not None: + if nexturl: pagetext.find('a', attrs={'class':'move_next'}).extract() 
pagetext.find('a', attrs={'class':'move_back'}).extract() + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - def preprocess_html(self, soup): - if soup.find('div', attrs={'class':'preview'}) is not None: + if soup.find('div', attrs={'class':'preview'}): self.image_article(soup, soup.body) else: self.append_page(soup, soup.body) for a in soup('a'): - if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: - a['href']=self.INDEX + a['href'] + if a.has_key('href') and not a['href'].startswith('http'): + a['href'] = self.INDEX + a['href'] + for r in soup.findAll(attrs={'class':['comments', 'body']}): + r.extract() return soup diff --git a/recipes/biweekly.recipe b/recipes/biweekly.recipe new file mode 100644 index 0000000000..a1bf41f621 --- /dev/null +++ b/recipes/biweekly.recipe @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = u'Łukasz Grąbczewski 2011' +__version__ = '2.0' + +import re, os +from calibre import walk +from calibre.utils.zipfile import ZipFile +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe + +class biweekly(BasicNewsRecipe): + __author__ = u'Łukasz Grąbczewski' + title = 'Biweekly' + language = 'en_PL' + publisher = 'National Audiovisual Institute' + publication_type = 'magazine' + description = u'link with culture [English edition of Polish magazine]: literature, theatre, film, art, music, views, talks' + + conversion_options = { + 'authors' : 'Biweekly.pl' + ,'publisher' : publisher + ,'language' : language + ,'comments' : description + ,'no_default_epub_cover' : True + ,'preserve_cover_aspect_ratio': True + } + + def build_index(self): + browser = self.get_browser() + browser.open('http://www.biweekly.pl/') + + # find the link + epublink = 
browser.find_link(text_regex=re.compile('ePUB VERSION')) + + # download ebook + self.report_progress(0,_('Downloading ePUB')) + response = browser.follow_link(epublink) + book_file = PersistentTemporaryFile(suffix='.epub') + book_file.write(response.read()) + book_file.close() + + # convert + self.report_progress(0.2,_('Converting to OEB')) + oeb = self.output_dir + '/INPUT/' + if not os.path.exists(oeb): + os.makedirs(oeb) + with ZipFile(book_file.name) as f: + f.extractall(path=oeb) + + for f in walk(oeb): + if f.endswith('.opf'): + return f diff --git a/recipes/blog_biszopa.recipe b/recipes/blog_biszopa.recipe new file mode 100644 index 0000000000..7393f23f3b --- /dev/null +++ b/recipes/blog_biszopa.recipe @@ -0,0 +1,30 @@ +__license__ = 'GPL v3' +from calibre.web.feeds.news import BasicNewsRecipe + +class BlogBiszopa(BasicNewsRecipe): + title = u'Blog Biszopa' + __author__ = 'fenuks' + description = u'Zapiski z Granitowego Miasta' + category = 'history' + #publication_type = '' + language = 'pl' + #encoding = '' + #extra_css = '' + cover_url = 'http://blogbiszopa.pl/wp-content/themes/biszop/images/logo.png' + masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(id='main-content')] + remove_tags = [dict(name='footer')] + #remove_tags_after = {} + #remove_tags_before = {} + + feeds = [(u'Artyku\u0142y', u'http://blogbiszopa.pl/feed/')] + diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe index 4e19fbc6c1..6be7a2ae12 100644 --- a/recipes/cd_action.recipe +++ b/recipes/cd_action.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class CD_Action(BasicNewsRecipe): title = u'CD-Action' __author__ = 'fenuks' - description = 'cdaction.pl - polish games magazine site' + description = 'Strona 
CD-Action (CDA), największego w Polsce pisma dla graczy.Pełne wersje gier, newsy, recenzje, zapowiedzi, konkursy, forum, opinie, galerie screenów,trailery, filmiki, patche, teksty. Gry komputerowe (PC) oraz na konsole (PS3, XBOX 360).' category = 'games' language = 'pl' index='http://www.cdaction.pl' @@ -24,4 +24,4 @@ class CD_Action(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe index 7c5138196d..b45f28e4ba 100644 --- a/recipes/ciekawostki_historyczne.recipe +++ b/recipes/ciekawostki_historyczne.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe import re + class Ciekawostki_Historyczne(BasicNewsRecipe): title = u'Ciekawostki Historyczne' oldest_article = 7 @@ -7,42 +8,30 @@ class Ciekawostki_Historyczne(BasicNewsRecipe): description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.' 
category = 'history' language = 'pl' - masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' - cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + masthead_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + cover_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' max_articles_per_feed = 100 + oldest_article = 140000 preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')] - no_stylesheets=True - remove_empty_feeds=True - keep_only_tags=[dict(name='div', attrs={'class':'post'})] - remove_tags=[dict(id='singlepostinfo')] + no_stylesheets = True + remove_empty_feeds = True + keep_only_tags = [dict(name='div', attrs={'class':'post'})] + recursions = 5 + remove_tags = [dict(id='singlepostinfo')] + feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')] - def append_page(self, soup, appendtag): - tag=soup.find(name='h7') - if tag: - if tag.br: - pass - elif tag.nextSibling.name=='p': - tag=tag.nextSibling - nexturl = tag.findAll('a') - for nextpage in nexturl: - 
tag.extract() - nextpage= nextpage['href'] - soup2 = self.index_to_soup(nextpage) - pagetext = soup2.find(name='div', attrs={'class':'post'}) - for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}): - r.extract() - for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}): - r.extract() - for r in pagetext.findAll('h1'): - r.extract() - pagetext.find('h6').nextSibling.extract() - pagetext.find('h7').nextSibling.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) + def is_link_wanted(self, url, tag): + return 'ciekawostkihistoryczne' in url and url[-2] in {'2', '3', '4', '5', '6'} - def preprocess_html(self, soup): - self.append_page(soup, soup.body) + def postprocess_html(self, soup, first_fetch): + tag = soup.find('h7') + if tag: + tag.nextSibling.extract() + if not first_fetch: + for r in soup.findAll(['h1']): + r.extract() + soup.find('h6').nextSibling.extract() return soup - - \ No newline at end of file + + diff --git a/recipes/computer_woche.recipe b/recipes/computer_woche.recipe new file mode 100644 index 0000000000..5ee415b6b0 --- /dev/null +++ b/recipes/computer_woche.recipe @@ -0,0 +1,66 @@ +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' +''' +Fetch Computerwoche. 
+''' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class Computerwoche(BasicNewsRecipe): + + title = 'Computerwoche' + description = 'german computer newspaper' + language = 'de' + __author__ = 'Maria Seliger' + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + linearize_tables = True + no_stylesheets = True + remove_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + html2epub_options = 'base_font_size=10' + summary_length = 100 + auto_cleanup = True + + + extra_css = ''' + h2{font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #003399;} + a{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-style:italic;} + .dachzeile p{font-family:Arial,Helvetica,sans-serif; font-size: x-small; } + h1{ font-family:Arial,Helvetica,sans-serif; font-size:x-large; font-weight:bold;} + .artikelTeaser{font-family:Arial,Helvetica,sans-serif; font-size: x-small; font-weight:bold; } + body{font-family:Arial,Helvetica,sans-serif; } + .photo {font-family:Arial,Helvetica,sans-serif; font-size: x-small; color: #666666;} ''' + + feeds = [ ('Computerwoche', 'http://rss.feedsportal.com/c/312/f/4414/index.rss'), + ('IDG Events', 'http://rss.feedsportal.com/c/401/f/7544/index.rss'), + ('Computerwoche Jobs und Karriere', 'http://rss.feedsportal.com/c/312/f/434082/index.rss'), + ('Computerwoche BI und ECM', 'http://rss.feedsportal.com/c/312/f/434083/index.rss'), + ('Computerwoche Cloud Computing', 'http://rss.feedsportal.com/c/312/f/534647/index.rss'), + ('Computerwoche Compliance und Recht', 'http://rss.feedsportal.com/c/312/f/434084/index.rss'), + ('Computerwoche CRM', 'http://rss.feedsportal.com/c/312/f/434085/index.rss'), + ('Computerwoche Data Center und Server', 'http://rss.feedsportal.com/c/312/f/434086/index.rss'), + ('Computerwoche ERP', 'http://rss.feedsportal.com/c/312/f/434087/index.rss'), + ('Computerwoche IT Macher', 'http://rss.feedsportal.com/c/312/f/534646/index.rss'), + ('Computerwoche 
IT-Services', 'http://rss.feedsportal.com/c/312/f/434089/index.rss'), + ('Computerwoche IT-Strategie', 'http://rss.feedsportal.com/c/312/f/434090/index.rss'), + ('Computerwoche Mittelstands-IT', 'http://rss.feedsportal.com/c/312/f/434091/index.rss'), + ('Computerwoche Mobile und Wireless', 'http://rss.feedsportal.com/c/312/f/434092/index.rss'), + ('Computerwoche Netzwerk', 'http://rss.feedsportal.com/c/312/f/434093/index.rss'), + ('Computerwoche Notebook und PC', 'http://rss.feedsportal.com/c/312/f/434094/index.rss'), + ('Computerwoche Office und Tools', 'http://rss.feedsportal.com/c/312/f/434095/index.rss'), + ('Computerwoche Security', 'http://rss.feedsportal.com/c/312/f/434098/index.rss'), + ('Computerwoche SOA und BPM', 'http://rss.feedsportal.com/c/312/f/434099/index.rss'), + ('Computerwoche Software Infrastruktur', 'http://rss.feedsportal.com/c/312/f/434096/index.rss'), + ('Computerwoche Storage', 'http://rss.feedsportal.com/c/312/f/534645/index.rss'), + ('Computerwoche VoIP und TK', 'http://rss.feedsportal.com/c/312/f/434102/index.rss'), + ('Computerwoche Web', 'http://rss.feedsportal.com/c/312/f/434103/index.rss'), + ('Computerwoche Home-IT', 'http://rss.feedsportal.com/c/312/f/434104/index.rss')] + + + def print_version(self, url): + return url.replace ('/a/', '/a/print/') + diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 2ec457e4de..6d4e2a179f 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -1,5 +1,5 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - +import re from calibre.web.feeds.news import BasicNewsRecipe class Computerworld_pl(BasicNewsRecipe): title = u'Computerworld.pl' @@ -7,17 +7,21 @@ class Computerworld_pl(BasicNewsRecipe): description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne' category = 'IT' language = 'pl' - masthead_url= 
'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' - no_stylesheets=True + masthead_url = 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' + cover_url = 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif' + no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 - keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})] - remove_tags_after=dict(name='div', attrs={'class':'rMobi'}) - remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})] + remove_attributes = ['style',] + preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''),] + keep_only_tags = [dict(id=['szpaltaL', 's2011'])] + remove_tags_after = dict(name='div', attrs={'class':'tresc'}) + remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}),] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] - def get_cover_url(self): - soup = self.index_to_soup('http://www.computerworld.pl/') - cover=soup.find(name='img', attrs={'class':'prawo'}) - self.cover_url=cover['src'] - return getattr(self, 'cover_url', self.cover_url) + def skip_ad_pages(self, soup): + if soup.title.string.lower() == 'advertisement': + tag = soup.find(name='a') + if tag: + new_soup = self.index_to_soup(tag['href'], raw=True) + return new_soup \ No newline at end of file diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe index 8b4288ddcd..9b2f6e8200 100644 --- a/recipes/conowego_pl.recipe +++ b/recipes/conowego_pl.recipe @@ -1,14 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment + class CoNowegoPl(BasicNewsRecipe): title = u'conowego.pl' __author__ = 'fenuks' description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. 
U nas znajdziesz wszystko o elektronice użytkowej !' - cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png' + #cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png' category = 'IT, news' language = 'pl' oldest_article = 7 max_articles_per_feed = 100 + INDEX = 'http://www.conowego.pl/' no_stylesheets = True remove_empty_feeds = True use_embedded_content = False @@ -33,6 +35,16 @@ class CoNowegoPl(BasicNewsRecipe): pagetext = soup2.find(attrs={'class':'ni_content'}) pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}): r.extract() + + def get_cover_url(self): + soup = self.index_to_soup('http://www.conowego.pl/magazyn/') + tag = soup.find(attrs={'class':'ms_left'}) + if tag: + self.cover_url = self.INDEX + tag.find('img')['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/czas_gentlemanow.recipe b/recipes/czas_gentlemanow.recipe index 6df677f25f..009cc7e9dd 100644 --- a/recipes/czas_gentlemanow.recipe +++ b/recipes/czas_gentlemanow.recipe @@ -1,4 +1,5 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +import re from calibre.web.feeds.news import BasicNewsRecipe class CzasGentlemanow(BasicNewsRecipe): @@ -13,8 +14,9 @@ class CzasGentlemanow(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True + preprocess_regexps = [(re.compile(u'<h3>Może Cię też zainteresować:</h3>'), lambda m: '')] use_embedded_content = False keep_only_tags = [dict(name='div', attrs={'class':'content'})] - remove_tags = [dict(attrs={'class':'meta_comments'})] - remove_tags_after = dict(name='div', attrs={'class':'fblikebutton_button'}) + remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails'])] + 
remove_tags_after = dict(id='comments') feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')] diff --git a/recipes/deccan_herald.recipe b/recipes/deccan_herald.recipe new file mode 100644 index 0000000000..f2b4b37129 --- /dev/null +++ b/recipes/deccan_herald.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1362501327(BasicNewsRecipe): + title = u'Deccan Herald' + __author__ = 'Muruli Shamanna' + description = 'Daily news from the Deccan Herald' + + oldest_article = 1 + max_articles_per_feed = 100 + auto_cleanup = True + category = 'News' + language = 'en_IN' + encoding = 'utf-8' + publisher = 'The Printers (Mysore) Private Ltd' + ##use_embedded_content = True + + cover_url = 'http://www.quizzing.in/wp-content/uploads/2010/07/DH.gif' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + + feeds = [(u'News', u'http://www.deccanherald.com/rss/news.rss'), (u'Business', u'http://www.deccanherald.com/rss/business.rss'), (u'Entertainment', u'http://www.deccanherald.com/rss/entertainment.rss'), (u'Sports', u'http://www.deccanherald.com/rss/sports.rss'), (u'Environment', u'http://www.deccanherald.com/rss/environment.rss')] + + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:150%;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:155%;} + img {max-width:100%; min-width:100%;} + p{font-family:Arial,Helvetica,sans-serif;font-size:large;} + 
body{font-family:Helvetica,Arial,sans-serif;font-size:medium;} + ''' diff --git a/recipes/democracy_journal.recipe b/recipes/democracy_journal.recipe new file mode 100644 index 0000000000..f02a3b70a8 --- /dev/null +++ b/recipes/democracy_journal.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class AdvancedUserRecipe1361743898(BasicNewsRecipe): + title = u'Democracy Journal' + description = '''A journal of ideas. Published quarterly.''' + __author__ = u'David Nye' + language = 'en' + oldest_article = 90 + max_articles_per_feed = 30 + no_stylesheets = True + auto_cleanup = True + + def parse_index(self): + articles = [] + feeds = [] + soup = self.index_to_soup("http://www.democracyjournal.org") + for x in soup.findAll(href=re.compile("http://www\.democracyjournal\.org/\d*/.*php$")): + url = x.get('href') + title = self.tag_to_string(x) + articles.append({'title':title, 'url':url, 'description':'', 'date':''}) + feeds.append(('Articles', articles)) + return feeds + + def print_version(self, url): + return url + '?page=all' + diff --git a/recipes/di.recipe b/recipes/di.recipe index 179983e4dd..dad0fdd648 100644 --- a/recipes/di.recipe +++ b/recipes/di.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python -__license__ = 'GPL v3' +__license__ = 'GPL v3' __author__ = 'Mori' __version__ = 'v. 0.5' ''' @@ -11,56 +11,56 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class DziennikInternautowRecipe(BasicNewsRecipe): - __author__ = 'Mori' - language = 'pl' + __author__ = 'Mori' + language = 'pl' - title = u'Dziennik Internautow' - publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.' - description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.' + title = u'Dziennik Internautow' + publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.' + description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.' 
- max_articles_per_feed = 100 - oldest_article = 7 - cover_url = 'http://di.com.pl/pic/logo_di_norm.gif' - - no_stylesheets = True - remove_javascript = True - encoding = 'utf-8' - - extra_css = ''' - .fotodesc{font-size: 75%;} - .pub_data{font-size: 75%;} - .fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;} - #pub_foto{font-size: 75%; float: left; padding-right: 10px;} - ''' - - feeds = [ - (u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di') - ] - - keep_only_tags = [ - dict(name = 'div', attrs = {'id' : 'pub_head'}), - dict(name = 'div', attrs = {'id' : 'pub_content'}) - ] - - remove_tags = [ - dict(name = 'div', attrs = {'class' : 'poradniki_context'}), - dict(name = 'div', attrs = {'class' : 'uniBox'}), - dict(name = 'object', attrs = {}), - dict(name = 'h3', attrs = {}), - dict(attrs={'class':'twitter-share-button'}) - ] - - preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in - [ - (r', <a href="http://di.com.pl/komentarze,.*?</div>', lambda match: '</div>'), - (r'<div class="fotonews".*?">', lambda match: '<div class="fotonews">'), - (r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'), - (r'\s*</', lambda match: '</'), - ] - ] + max_articles_per_feed = 100 + oldest_article = 7 + cover_url = 'http://di.com.pl/pic/logo_di_norm.gif' - def skip_ad_pages(self, soup): - if 'Advertisement' in soup.title: - nexturl=soup.find('a')['href'] - return self.index_to_soup(nexturl, raw=True) + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + + extra_css = ''' + .fotodesc{font-size: 75%;} + .pub_data{font-size: 75%;} + .fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;} + #pub_foto{font-size: 75%; float: left; padding-right: 10px;} + ''' + + feeds = [ + (u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di') + ] + + keep_only_tags = [ + dict(name = 'div', attrs = {'id' : 'pub_head'}), + dict(name = 'div', attrs = 
{'id' : 'pub_content'}) + ] + + remove_tags = [ + dict(name = 'div', attrs = {'class' : 'poradniki_context'}), + dict(name = 'div', attrs = {'class' : 'uniBox'}), + dict(name = 'object', attrs = {}), + dict(name = 'h3', attrs = {}), + dict(attrs={'class':'twitter-share-button'}) + ] + + preprocess_regexps = [ + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r', <a href="http://di.com.pl/komentarze,.*?</div>', lambda match: '</div>'), + (r'<div class="fotonews".*?">', lambda match: '<div class="fotonews">'), + (r'http://di.com.pl/pic/photo/mini/', lambda match: 'http://di.com.pl/pic/photo/oryginal/'), + (r'\s*</', lambda match: '</'), + ] + ] + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) diff --git a/recipes/discover_magazine.recipe b/recipes/discover_magazine.recipe index 02cdb952b5..a7f080bb5f 100644 --- a/recipes/discover_magazine.recipe +++ b/recipes/discover_magazine.recipe @@ -33,6 +33,21 @@ class DiscoverMagazine(BasicNewsRecipe): remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})] + # Login stuff + needs_subscription = True + use_javascript_to_login = True + requires_version = (0, 9, 20) + + def javascript_login(self, br, username, password): + br.visit('http://discovermagazine.com', timeout=120) + f = br.select_form('div.login.section div.form') + f['username'] = username + f['password'] = password + br.submit('input[id="signInButton"]', timeout=120) + br.run_for_a_time(20) + # End login stuff + + def append_page(self, soup, appendtag, position): pager = soup.find('span',attrs={'class':'next'}) if pager: diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index a4e24ac61b..708bdbb017 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -18,7 +18,7 @@ class Dobreprogramy_pl(BasicNewsRecipe): max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'<div 
id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ] keep_only_tags=[dict(attrs={'class':['news', 'entry single']})] - remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze')] + remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')] #remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})] feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'), ('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')] diff --git a/recipes/drytooling_pl.recipe b/recipes/drytooling_pl.recipe index bb05e1a25f..2b200aa83b 100644 --- a/recipes/drytooling_pl.recipe +++ b/recipes/drytooling_pl.recipe @@ -8,6 +8,7 @@ class BasicUserRecipe1337668045(BasicNewsRecipe): cover_url = 'http://drytooling.com.pl/images/drytooling-kindle.png' description = u'Drytooling.com.pl jest serwisem wspinaczki zimowej, alpinizmu i himalaizmu. Jeśli uwielbiasz zimę, nie możesz doczekać się aż wyciągniesz szpej z szafki i uderzysz w Tatry, Alpy, czy może Himalaje, to znajdziesz tutaj naprawdę dużo interesujących Cię treści! Zapraszamy!' 
__author__ = u'Damian Granowski' + language = 'pl' oldest_article = 100 max_articles_per_feed = 20 auto_cleanup = True diff --git a/recipes/dwutygodnik.recipe b/recipes/dwutygodnik.recipe new file mode 100644 index 0000000000..9321d79d80 --- /dev/null +++ b/recipes/dwutygodnik.recipe @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = u'Łukasz Grąbczewski 2011' +__version__ = '2.0' + +import re, os +from calibre import walk +from calibre.utils.zipfile import ZipFile +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe + +class dwutygodnik(BasicNewsRecipe): + __author__ = u'Łukasz Grąbczewski' + title = 'Dwutygodnik' + language = 'pl' + publisher = 'Narodowy Instytut Audiowizualny' + publication_type = 'magazine' + description = u'Strona Kultury: literatura, teatr, film, sztuka, muzyka, felietony, rozmowy' + + conversion_options = { + 'authors' : 'Dwutygodnik.com' + ,'publisher' : publisher + ,'language' : language + ,'comments' : description + ,'no_default_epub_cover' : True + ,'preserve_cover_aspect_ratio': True + } + + def build_index(self): + browser = self.get_browser() + browser.open('http://www.dwutygodnik.com/') + + # find the link + epublink = browser.find_link(text_regex=re.compile('Wersja ePub')) + + # download ebook + self.report_progress(0,_('Downloading ePUB')) + response = browser.follow_link(epublink) + book_file = PersistentTemporaryFile(suffix='.epub') + book_file.write(response.read()) + book_file.close() + + # convert + self.report_progress(0.2,_('Converting to OEB')) + oeb = self.output_dir + '/INPUT/' + if not os.path.exists(oeb): + os.makedirs(oeb) + with ZipFile(book_file.name) as f: + f.extractall(path=oeb) + + for f in walk(oeb): + if f.endswith('.opf'): + return f + diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe index 603591e9f0..50de40354c 100644 --- a/recipes/dzieje_pl.recipe +++ b/recipes/dzieje_pl.recipe @@ 
-1,9 +1,10 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class Dzieje(BasicNewsRecipe): title = u'dzieje.pl' __author__ = 'fenuks' - description = 'Dzieje - history of Poland' + description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.' cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png' category = 'history' language = 'pl' @@ -11,8 +12,8 @@ class Dzieje(BasicNewsRecipe): index = 'http://dzieje.pl' oldest_article = 8 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets= True + remove_javascript = True + no_stylesheets = True keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')] remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')] #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] @@ -28,16 +29,19 @@ class Dzieje(BasicNewsRecipe): pagetext = soup2.find(id='content-area').find(attrs={'class':'content'}) for r in pagetext.findAll(attrs={'class':['fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult', 'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka', 'fieldgroup group-zdjeciedu', 'field field-type-filefield field-field-zdjecieglownawyd']}): r.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + # appendtag.insert(pos, pagetext) tag = soup2.find('li', attrs={'class':'pager-next'}) for r in appendtag.findAll(attrs={'class':['item-list', 'field field-type-computed field-field-tagi', ]}): r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() def find_articles(self, url): articles = [] - soup=self.index_to_soup(url) - tag=soup.find(id='content-area').div.div + soup = self.index_to_soup(url) + 
tag = soup.find(id='content-area').div.div for i in tag.findAll('div', recursive=False): temp = i.find(attrs={'class':'views-field-title'}).span.a title = temp.string @@ -64,7 +68,7 @@ class Dzieje(BasicNewsRecipe): def preprocess_html(self, soup): for a in soup('a'): - if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: - a['href']=self.index + a['href'] + if a.has_key('href') and not a['href'].startswith('http'): + a['href'] = self.index + a['href'] self.append_page(soup, soup.body) - return soup \ No newline at end of file + return soup diff --git a/recipes/dziennik_baltycki.recipe b/recipes/dziennik_baltycki.recipe new file mode 100644 index 0000000000..3cbe3c0968 --- /dev/null +++ b/recipes/dziennik_baltycki.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class DziennikBaltycki(BasicNewsRecipe): + title = u'Dziennik Ba\u0142tycki' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Dziennik Bałtycki. Najnowsze Wiadomości Trójmiasto i Wiadomości Pomorskie. Czytaj!' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dziennikbaltycki.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds= True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + + feeds = [(u'Wiadomo\u015bci', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_wiadomosci.xml?201302'), (u'Sport', u'http://dziennikbaltycki.feedsportal.com/c/32980/f/533756/index.rss?201302'), (u'Rejsy', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_rejsy.xml?201302'), (u'Biznes na Pomorzu', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_biznesnapomorzu.xml?201302'), (u'GOM', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_gom.xml?201302'), (u'Opinie', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_opinie.xml?201302'), (u'Pitawal Pomorski', u'http://www.dziennikbaltycki.pl/rss/dziennikbaltycki_pitawalpomorski.xml?201302')] + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def 
skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-baltycki/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/dziennik_lodzki.recipe b/recipes/dziennik_lodzki.recipe new file mode 100644 index 0000000000..93a86fdaa2 --- /dev/null +++ b/recipes/dziennik_lodzki.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class DziennikLodzki(BasicNewsRecipe): + title = u'Dziennik \u0141\xf3dzki' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Dziennik Łódzki. Najnowsze Wiadomości Łódź. Czytaj Wiadomości Łódzkie!' + category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dzienniklodzki.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', 
attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + + feeds = [(u'Na sygnale', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_nasygnale.xml?201302'), (u'\u0141\xf3d\u017a', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_lodz.xml?201302'), (u'Opinie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_opinie.xml?201302'), (u'Pieni\u0105dze', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533763/index.rss?201302'), (u'Kultura', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533762/index.rss?201302'), (u'Sport', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533761/index.rss?201302'), (u'Akcje', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_akcje.xml?201302'), (u'M\xf3j Reporter', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_mojreporter.xml?201302'), (u'Studni\xf3wki', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_studniowki.xml?201302'), (u'Kraj', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_kraj.xml?201302'), (u'Zdrowie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_zdrowie.xml?201302')] + + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-lodzki/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe index 5b9cc457f4..44dd596324 100644 --- a/recipes/dziennik_pl.recipe +++ b/recipes/dziennik_pl.recipe @@ -2,6 +2,8 @@ from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.ebooks.BeautifulSoup import Comment + class Dziennik_pl(BasicNewsRecipe): title = u'Dziennik.pl' __author__ = 'fenuks' @@ -9,17 +11,17 @@ class Dziennik_pl(BasicNewsRecipe): category = 'newspaper' language = 'pl' masthead_url= 
'http://5.s.dziennik.pl/images/logos.png' - cover_url= 'http://5.s.dziennik.pl/images/logos.png' + cover_url = 'http://5.s.dziennik.pl/images/logos.png' no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 - remove_javascript=True - remove_empty_feeds=True + remove_javascript = True + remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} - extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + extra_css = 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')] - keep_only_tags=[dict(id='article')] - remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] + keep_only_tags = [dict(id='article')] + remove_tags = [dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})] feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'), (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'), (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'), @@ -34,26 +36,29 @@ class Dziennik_pl(BasicNewsRecipe): (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) + tag = soup.find(name='a', attrs={'title':'CZYTAJ DALEJ'}) if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) + new_soup = 
self.index_to_soup(tag['href'], raw=True) return new_soup def append_page(self, soup, appendtag): - tag=soup.find('a', attrs={'class':'page_next'}) + tag = soup.find('a', attrs={'class':'page_next'}) if tag: appendtag.find('div', attrs={'class':'article_paginator'}).extract() while tag: - soup2= self.index_to_soup(tag['href']) - tag=soup2.find('a', attrs={'class':'page_next'}) + soup2 = self.index_to_soup(tag['href']) + tag = soup2.find('a', attrs={'class':'page_next'}) if not tag: for r in appendtag.findAll('div', attrs={'class':'art_src'}): r.extract() pagetext = soup2.find(name='div', attrs={'class':'article_body'}) for dictionary in self.remove_tags: - v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs']) + v = pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs']) for delete in v: delete.extract() + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) if appendtag.find('div', attrs={'class':'article_paginator'}): diff --git a/recipes/dziennik_wschodni.recipe b/recipes/dziennik_wschodni.recipe new file mode 100644 index 0000000000..da5d3bb1d9 --- /dev/null +++ b/recipes/dziennik_wschodni.recipe @@ -0,0 +1,84 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class DziennikWschodni(BasicNewsRecipe): + title = u'Dziennik Wschodni' + __author__ = 'fenuks' + description = u'Dziennik Wschodni - portal regionalny województwa lubelskiego.' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.dziennikwschodni.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + + feeds = [(u'Wszystkie', u'http://www.dziennikwschodni.pl/rss.xml'), + (u'Lublin', u'http://www.dziennikwschodni.pl/lublin.xml'), + (u'Zamość', u'http://www.dziennikwschodni.pl/zamosc.xml'), + (u'Biała Podlaska', u'http://www.dziennikwschodni.pl/biala_podlaska.xml'), + (u'Chełm', u'http://www.dziennikwschodni.pl/chelm.xml'), + (u'Kraśnik', u'http://www.dziennikwschodni.pl/krasnik.xml'), + (u'Puławy', u'http://www.dziennikwschodni.pl/pulawy.xml'), + (u'Świdnik', u'http://www.dziennikwschodni.pl/swidnik.xml'), + (u'Łęczna', u'http://www.dziennikwschodni.pl/leczna.xml'), + (u'Lubartów', u'http://www.dziennikwschodni.pl/lubartow.xml'), + (u'Sport', u'http://www.dziennikwschodni.pl/sport.xml'), + (u'Praca', u'http://www.dziennikwschodni.pl/praca.xml'), + (u'Dom', u'http://www.dziennikwschodni.pl/dom.xml'), + (u'Moto', u'http://www.dziennikwschodni.pl/moto.xml'), 
+ (u'Zdrowie', u'http://www.dziennikwschodni.pl/zdrowie.xml'), + ] + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/dziennik_zachodni.recipe b/recipes/dziennik_zachodni.recipe new file mode 100644 index 0000000000..126c876937 --- /dev/null +++ b/recipes/dziennik_zachodni.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class DziennikZachodni(BasicNewsRecipe): + title = u'Dziennik Zachodni' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Dziennik Zachodni. Najnowsze Wiadomości Śląskie. Wiadomości Śląsk. Czytaj!' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/dziennikzachodni.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds= True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}), dict(attrs={'href':'http://www.dziennikzachodni.pl/piano'})] + + feeds = [(u'Wszystkie', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533764/index.rss?201302'), (u'Wiadomo\u015bci', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533765/index.rss?201302'), (u'Regiony', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'), (u'Opinie', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'), (u'Blogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_blogi.xml?201302'), (u'Serwisy', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_serwisy.xml?201302'), (u'Sport', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533766/index.rss?201302'), (u'M\xf3j Reporter', 
u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_mojreporter.xml?201302'), (u'Na narty', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_nanarty.xml?201302'), (u'Drogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_drogi.xml?201302'), (u'Pieni\u0105dze', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533768/index.rss?201302')] + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/dziennik-zachodni/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/echo_dnia.recipe b/recipes/echo_dnia.recipe new file mode 100644 index 0000000000..def87ce0e1 --- /dev/null +++ b/recipes/echo_dnia.recipe @@ -0,0 +1,79 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class EchoDnia(BasicNewsRecipe): + title = u'Echo Dnia' + __author__ = 'fenuks' + description = u'Echo Dnia - portal regionalny świętokrzyskiego radomskiego i podkarpackiego. Najnowsze wiadomości z Twojego regionu, galerie, video, mp3.' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.echodnia.eu' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + feeds = [(u'Wszystkie', u'http://www.echodnia.eu/rss.xml'), + (u'Świętokrzyskie', u'http://www.echodnia.eu/swietokrzyskie.xml'), + (u'Radomskie', u'http://www.echodnia.eu/radomskie.xml'), + (u'Podkarpackie', u'http://www.echodnia.eu/podkarpackie.xml'), + (u'Sport \u015bwi\u0119tokrzyski', u'http://www.echodnia.eu/sport_swi.xml'), + (u'Sport radomski', u'http://www.echodnia.eu/sport_rad.xml'), + (u'Sport podkarpacki', u'http://www.echodnia.eu/sport_pod.xml'), + (u'Pi\u0142ka no\u017cna', u'http://www.echodnia.eu/pilka.xml'), + (u'Praca', u'http://www.echodnia.eu/praca.xml'), + (u'Dom', u'http://www.echodnia.eu/dom.xml'), + (u'Auto', u'http://www.echodnia.eu/auto.xml'), + (u'Zdrowie', u'http://www.echodnia.eu/zdrowie.xml')] + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + 
soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/eclicto.recipe b/recipes/eclicto.recipe index 8ef9b8467c..f0192773ec 100644 --- a/recipes/eclicto.recipe +++ b/recipes/eclicto.recipe @@ -1,8 +1,6 @@ #!/usr/bin/env python -__license__ = 'GPL v3' -__author__ = 'Mori' -__version__ = 'v. 
0.1' +__license__ = 'GPL v3' ''' blog.eclicto.pl ''' @@ -11,39 +9,39 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class BlogeClictoRecipe(BasicNewsRecipe): - __author__ = 'Mori' - language = 'pl' + __author__ = 'Mori, Tomasz Długosz' + language = 'pl' - title = u'Blog eClicto' - publisher = u'Blog eClicto' - description = u'Blog o e-papierze i e-bookach' + title = u'Blog eClicto' + publisher = u'Blog eClicto' + description = u'Blog o e-papierze i e-bookach' - max_articles_per_feed = 100 - cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif' - - no_stylesheets = True - remove_javascript = True - encoding = 'utf-8' - - extra_css = ''' - img{float: left; padding-right: 10px; padding-bottom: 5px;} - ''' - - feeds = [ - (u'Blog eClicto', u'http://blog.eclicto.pl/feed/') - ] - - remove_tags = [ - dict(name = 'span', attrs = {'id' : 'tags'}) - ] - - remove_tags_after = [ - dict(name = 'div', attrs = {'class' : 'post'}) - ] - - preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in - [ - (r'\s*</', lambda match: '</'), - ] - ] \ No newline at end of file + max_articles_per_feed = 100 + cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif' + + no_stylesheets = True + remove_javascript = True + encoding = 'utf-8' + + extra_css = ''' + img{float: left; padding-right: 10px; padding-bottom: 5px;} + ''' + + feeds = [ + (u'Blog eClicto', u'http://blog.eclicto.pl/feed/') + ] + + remove_tags = [ + dict(name = 'div', attrs = {'class' : 'social_bookmark'}), + ] + + keep_only_tags = [ + dict(name = 'div', attrs = {'class' : 'post'}) + ] + + preprocess_regexps = [ + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r'\s*</', lambda match: '</'), + ] + ] diff --git a/recipes/eioba.recipe b/recipes/eioba.recipe index 1df79d64bd..f55138931c 100644 --- a/recipes/eioba.recipe +++ b/recipes/eioba.recipe @@ -4,6 +4,7 @@ from calibre.web.feeds.news import 
BasicNewsRecipe class eioba(BasicNewsRecipe): title = u'eioba' __author__ = 'fenuks' + description = u'eioba.pl - daj się przeczytać!' cover_url = 'http://www.eioba.org/lay/logo_pl_v3.png' language = 'pl' oldest_article = 7 diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index 2b0933b58d..21d3b607d2 100644 --- a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True use_embedded_content = False - remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})] + remove_attrs = ['style'] + remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git a/recipes/el_malpensante.recipe b/recipes/el_malpensante.recipe new file mode 100644 index 0000000000..7a014735b6 --- /dev/null +++ b/recipes/el_malpensante.recipe @@ -0,0 +1,27 @@ +# coding=utf-8 +# https://github.com/iemejia/calibrecolombia + +''' +http://www.elmalpensante.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMalpensante(BasicNewsRecipe): + title = u'El Malpensante' + language = 'es_CO' + __author__ = 'Ismael Mejia <iemejia@gmail.com>' + cover_url = 'http://elmalpensante.com/img/layout/logo.gif' + description = 'El Malpensante' + oldest_article = 30 + simultaneous_downloads = 20 + #tags = 'news, sport, blog' + use_embedded_content = True + remove_empty_feeds = True + max_articles_per_feed = 100 + feeds = [(u'Artículos', u'http://www.elmalpensante.com/articulosRSS.php'), + (u'Malpensantías', u'http://www.elmalpensante.com/malpensantiasRSS.php'), + (u'Margaritas', u'http://www.elmalpensante.com/margaritasRSS.php'), +# This one is almost the same as articulos so we leave articles +# (u'Noticias', 
u'http://www.elmalpensante.com/noticiasRSS.php'), + ] diff --git a/recipes/elektroda_pl.recipe b/recipes/elektroda_pl.recipe index 34871ea04a..1886c2aecd 100644 --- a/recipes/elektroda_pl.recipe +++ b/recipes/elektroda_pl.recipe @@ -5,7 +5,7 @@ class Elektroda(BasicNewsRecipe): title = u'Elektroda' oldest_article = 8 __author__ = 'fenuks' - description = 'Elektroda.pl' + description = 'Międzynarodowy portal elektroniczny udostępniający bogate zasoby z dziedziny elektroniki oraz forum dyskusyjne.' cover_url = 'http://demotywatory.elektroda.pl/Thunderpic/logo.gif' category = 'electronics' language = 'pl' diff --git a/recipes/elguardian.recipe b/recipes/elguardian.recipe new file mode 100644 index 0000000000..f5d035dd21 --- /dev/null +++ b/recipes/elguardian.recipe @@ -0,0 +1,93 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>' +''' +elguardian.com.ar +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ElGuardian(BasicNewsRecipe): + title = 'El Guardian' + __author__ = 'Darko Miletic' + description = "Semanario con todas las tendencias de un pais" + publisher = 'Editorial Apache SA' + category = 'news,politics,Argentina' + oldest_article = 8 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'es_AR' + remove_empty_feeds = True + publication_type = 'magazine' + issn = '1666-7476' + masthead_url = 'http://elguardian.com.ar/application/templates/frontend/images/home/logo.png' + extra_css = """ + body{font-family: Arial,sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'series' : title + , 'isbn' : issn + } + + keep_only_tags = [dict(attrs={'class':['fotos', 'header_nota', 'nota']})] + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds 
= [ + (u'El Pais' , u'http://elguardian.com.ar/RSS/el-pais.xml' ) + ,(u'Columnistas' , u'http://elguardian.com.ar/RSS/columnistas.xml' ) + ,(u'Personajes' , u'http://elguardian.com.ar/RSS/personajes.xml' ) + ,(u'Tinta roja' , u'http://elguardian.com.ar/RSS/tinta-roja.xml' ) + ,(u'Yo fui' , u'http://elguardian.com.ar/RSS/yo-fui.xml' ) + ,(u'Ciencia' , u'http://elguardian.com.ar/RSS/ciencia.xml' ) + ,(u'Cronicas' , u'http://elguardian.com.ar/RSS/cronicas.xml' ) + ,(u'Culturas' , u'http://elguardian.com.ar/RSS/culturas.xml' ) + ,(u'DxT' , u'http://elguardian.com.ar/RSS/dxt.xml' ) + ,(u'Fierros' , u'http://elguardian.com.ar/RSS/fierros.xml' ) + ,(u'Frente fashion', u'http://elguardian.com.ar/RSS/frente-fashion.xml') + ,(u'Pan y vino' , u'http://elguardian.com.ar/RSS/pan-y-vino.xml' ) + ,(u'Turismo' , u'http://elguardian.com.ar/RSS/turismo.xml' ) + ] + + def get_cover_url(self): + soup = self.index_to_soup('http://elguardian.com.ar/') + udata = soup.find('div', attrs={'class':'datosNumero'}) + if udata: + sdata = udata.find('div') + if sdata: + stra = re.findall(r'\d+', self.tag_to_string(sdata)) + self.conversion_options.update({'series_index':int(stra[1])}) + unumero = soup.find('div', attrs={'class':'ultimoNumero'}) + if unumero: + img = unumero.find('img', src=True) + if img: + return img['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe index 2fbf9ff514..0b3b207c5e 100644 --- a/recipes/emuzica_pl.recipe +++ b/recipes/emuzica_pl.recipe @@ -12,6 +12,7 @@ class 
eMuzyka(BasicNewsRecipe): no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100 + remove_attributes = ['style'] keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})] remove_tags=[dict(name='span', attrs={'id':'date'})] feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')] @@ -20,4 +21,4 @@ class eMuzyka(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/esenja.recipe b/recipes/esenja.recipe index b8b94ad66e..503b27b9fa 100644 --- a/recipes/esenja.recipe +++ b/recipes/esenja.recipe @@ -3,85 +3,153 @@ __license__ = 'GPL v3' __copyright__ = '2010, matek09, matek09@gmail.com' -from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment class Esensja(BasicNewsRecipe): - title = u'Esensja' - __author__ = 'matek09' - description = 'Monthly magazine' - encoding = 'utf-8' - no_stylesheets = True - language = 'pl' - remove_javascript = True - HREF = '0' + title = u'Esensja' + __author__ = 'matek09 & fenuks' + description = 'Magazyn kultury popularnej' + encoding = 'utf-8' + no_stylesheets = True + language = 'pl' + remove_javascript = True + masthead_url = 'http://esensja.pl/img/wrss.gif' + oldest_article = 1 + URL = 'http://esensja.pl' + HREF = '0' + remove_attributes = ['style', 'bgcolor', 'alt', 'color'] + keep_only_tags = [dict(attrs={'class':'sekcja'}), ] + #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'}) + #remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'})) + remove_tags_after = dict(id='tekst') - #keep_only_tags =[] - #keep_only_tags.append(dict(name = 'div', attrs = 
{'class' : 'article'}) - remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'})) - remove_tags_after = dict(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'})) + remove_tags = [dict(name = 'img', attrs = {'src' : ['../../../2000/01/img/tab_top.gif', '../../../2000/01/img/tab_bot.gif']}), + dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}), + #dict(attrs={'rel':'lightbox[galeria]'}) + dict(attrs={'class':['tekst_koniec', 'ref', 'wykop']}), + dict(attrs={'itemprop':['copyrightHolder', 'publisher']}), + dict(id='komentarze') + + ] - remove_tags =[] - remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'})) - remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'})) + extra_css = ''' + .t-title {font-size: x-large; font-weight: bold; text-align: left} + .t-author {font-size: x-small; text-align: left} + .t-title2 {font-size: x-small; font-style: italic; text-align: left} + .text {font-size: small; text-align: left} + .annot-ref {font-style: italic; text-align: left} + ''' - extra_css = ''' - .t-title {font-size: x-large; font-weight: bold; text-align: left} - .t-author {font-size: x-small; text-align: left} - .t-title2 {font-size: x-small; font-style: italic; text-align: left} - .text {font-size: small; text-align: left} - .annot-ref {font-style: italic; text-align: left} - ''' + preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), + (re.compile(ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), + ] - preprocess_regexps = [(re.compile(r'alt="[^"]*"'), - lambda match: '')] + def parse_index(self): + soup = self.index_to_soup('http://www.esensja.pl/magazyn/') + a = soup.find('a', attrs={'href' : re.compile('.*/index.html')}) + year = a['href'].split('/')[0] + month = a['href'].split('/')[1] + self.HREF = 'http://www.esensja.pl/magazyn/' + year + 
'/' + month + '/iso/' + soup = self.index_to_soup(self.HREF + '01.html') + self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg' + feeds = [] + chapter = '' + subchapter = '' + articles = [] + intro = soup.find('div', attrs={'class' : 'n-title'}) + ''' + introduction = {'title' : self.tag_to_string(intro.a), + 'url' : self.HREF + intro.a['href'], + 'date' : '', + 'description' : ''} + chapter = 'Wprowadzenie' + articles.append(introduction) + ''' + + for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}): + if tag.name in 'td': + if len(articles) > 0: + section = chapter + if len(subchapter) > 0: + section += ' - ' + subchapter + feeds.append((section, articles)) + articles = [] + if tag['class'] == 'chapter': + chapter = self.tag_to_string(tag).capitalize() + subchapter = '' + else: + subchapter = self.tag_to_string(tag) + subchapter = self.tag_to_string(tag) + continue + + finalurl = tag.a['href'] + if not finalurl.startswith('http'): + finalurl = self.HREF + finalurl + articles.append({'title' : self.tag_to_string(tag.a), 'url' : finalurl, 'date' : '', 'description' : ''}) + + a = self.index_to_soup(finalurl) + i = 1 + + while True: + div = a.find('div', attrs={'class' : 't-title2 nextpage'}) + if div is not None: + link = div.a['href'] + if not link.startswith('http'): + link = self.HREF + link + a = self.index_to_soup(link) + articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. 
' + str(i), 'url' : link, 'date' : '', 'description' : ''}) + i = i + 1 + else: + break - def parse_index(self): - soup = self.index_to_soup('http://www.esensja.pl/magazyn/') - a = soup.find('a', attrs={'href' : re.compile('.*/index.html')}) - year = a['href'].split('/')[0] - month = a['href'].split('/')[1] - self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/' - soup = self.index_to_soup(self.HREF + '01.html') - self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg' - feeds = [] - intro = soup.find('div', attrs={'class' : 'n-title'}) - introduction = {'title' : self.tag_to_string(intro.a), - 'url' : self.HREF + intro.a['href'], - 'date' : '', - 'description' : ''} - chapter = 'Wprowadzenie' - subchapter = '' - articles = [] - articles.append(introduction) - for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}): - if tag.name in 'td': - if len(articles) > 0: - section = chapter - if len(subchapter) > 0: - section += ' - ' + subchapter - feeds.append((section, articles)) - articles = [] - if tag['class'] == 'chapter': - chapter = self.tag_to_string(tag).capitalize() - subchapter = '' - else: - subchapter = self.tag_to_string(tag) - subchapter = self.tag_to_string(tag) - continue - articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''}) + return feeds - a = self.index_to_soup(self.HREF + tag.a['href']) - i = 1 - while True: - div = a.find('div', attrs={'class' : 't-title2 nextpage'}) - if div is not None: - a = self.index_to_soup(self.HREF + div.a['href']) - articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. 
' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''}) - i = i + 1 - else: - break + def append_page(self, soup, appendtag): + r = appendtag.find(attrs={'class':'wiecej_xxx'}) + if r: + nr = r.findAll(attrs={'class':'tn-link'})[-1] + try: + nr = int(nr.a.string) + except: + return + baseurl = soup.find(attrs={'property':'og:url'})['content'] + '&strona={0}' + for number in range(2, nr+1): + soup2 = self.index_to_soup(baseurl.format(number)) + pagetext = soup2.find(attrs={'class':'tresc'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['wiecej_xxx', 'tekst_koniec']}): + r.extract() + for r in appendtag.findAll('script'): + r.extract() + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + for tag in soup.findAll(attrs={'class':'img_box_right'}): + temp = tag.find('img') + src = '' + if temp: + src = temp.get('src', '') + for r in tag.findAll('a', recursive=False): + r.extract() + info = tag.find(attrs={'class':'img_info'}) + text = str(tag) + if not src: + src = re.search('src="[^"]*?"', text) + if src: + src = src.group(0) + src = src[5:].replace('//', '/') + if src: + tag.contents = [] + tag.insert(0, BeautifulSoup('<img src="{0}{1}" />'.format(self.URL, src))) + if info: + tag.insert(len(tag.contents), info) + return soup - return feeds diff --git a/recipes/esensja_(rss).recipe b/recipes/esensja_(rss).recipe new file mode 100644 index 0000000000..af23ea58a9 --- /dev/null +++ b/recipes/esensja_(rss).recipe @@ -0,0 +1,109 @@ +__license__ = 'GPL v3' +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment + +class EsensjaRSS(BasicNewsRecipe): + title = u'Esensja (RSS)' + __author__ = 'fenuks' + description = u'Magazyn kultury popularnej' + category = 'reading, 
fantasy, reviews, boardgames, culture' + #publication_type = '' + language = 'pl' + encoding = 'utf-8' + INDEX = 'http://www.esensja.pl' + extra_css = '''.t-title {font-size: x-large; font-weight: bold; text-align: left} + .t-author {font-size: x-small; text-align: left} + .t-title2 {font-size: x-small; font-style: italic; text-align: left} + .text {font-size: small; text-align: left} + .annot-ref {font-style: italic; text-align: left} + ''' + cover_url = '' + masthead_url = 'http://esensja.pl/img/wrss.gif' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + ignore_duplicate_articles = {'title', 'url'} + preprocess_regexps = [(re.compile(r'alt="[^"]*"'), lambda match: ''), + (re.compile(ur'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''), + ] + remove_attributes = ['style', 'bgcolor', 'alt', 'color'] + keep_only_tags = [dict(attrs={'class':'sekcja'}), ] + remove_tags_after = dict(id='tekst') + + remove_tags = [dict(name = 'img', attrs = {'src' : ['../../../2000/01/img/tab_top.gif', '../../../2000/01/img/tab_bot.gif']}), + dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}), + #dict(attrs={'rel':'lightbox[galeria]'}) + dict(attrs={'class':['tekst_koniec', 'ref', 'wykop']}), + dict(attrs={'itemprop':['copyrightHolder', 'publisher']}), + dict(id='komentarze') + ] + + feeds = [(u'Książka', u'http://esensja.pl/rss/ksiazka.rss'), + (u'Film', u'http://esensja.pl/rss/film.rss'), + (u'Komiks', u'http://esensja.pl/rss/komiks.rss'), + (u'Gry', u'http://esensja.pl/rss/gry.rss'), + (u'Muzyka', u'http://esensja.pl/rss/muzyka.rss'), + (u'Twórczość', u'http://esensja.pl/rss/tworczosc.rss'), + (u'Varia', u'http://esensja.pl/rss/varia.rss'), + (u'Zgryźliwi Tetrycy', u'http://esensja.pl/rss/tetrycy.rss'), + (u'Nowe książki', u'http://esensja.pl/rss/xnowosci.rss'), + (u'Ostatnio dodane książki', u'http://esensja.pl/rss/xdodane.rss'), + ] + + def 
get_cover_url(self): + soup = self.index_to_soup(self.INDEX) + cover = soup.find(id='panel_1') + self.cover_url = self.INDEX + cover.find('a')['href'].replace('index.html', '') + 'img/ilustr/cover_b.jpg' + return getattr(self, 'cover_url', self.cover_url) + + + def append_page(self, soup, appendtag): + r = appendtag.find(attrs={'class':'wiecej_xxx'}) + if r: + nr = r.findAll(attrs={'class':'tn-link'})[-1] + try: + nr = int(nr.a.string) + except: + return + baseurl = soup.find(attrs={'property':'og:url'})['content'] + '&strona={0}' + for number in range(2, nr+1): + soup2 = self.index_to_soup(baseurl.format(number)) + pagetext = soup2.find(attrs={'class':'tresc'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['wiecej_xxx', 'tekst_koniec']}): + r.extract() + for r in appendtag.findAll('script'): + r.extract() + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + for tag in soup.findAll(attrs={'class':'img_box_right'}): + temp = tag.find('img') + src = '' + if temp: + src = temp.get('src', '') + for r in tag.findAll('a', recursive=False): + r.extract() + info = tag.find(attrs={'class':'img_info'}) + text = str(tag) + if not src: + src = re.search('src="[^"]*?"', text) + if src: + src = src.group(0) + src = src[5:].replace('//', '/') + if src: + tag.contents = [] + tag.insert(0, BeautifulSoup('<img src="{0}{1}" />'.format(self.INDEX, src))) + if info: + tag.insert(len(tag.contents), info) + return soup diff --git a/recipes/eso_pl.recipe b/recipes/eso_pl.recipe new file mode 100644 index 0000000000..5ebb420396 --- /dev/null +++ b/recipes/eso_pl.recipe @@ -0,0 +1,23 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ESO(BasicNewsRecipe): + title = u'ESO PL' + __author__ = 'fenuks' + description = u'ESO, Europejskie Obserwatorium Południowe, buduje i 
obsługuje najbardziej zaawansowane naziemne teleskopy astronomiczne na świecie' + category = 'astronomy' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1922519424/eso-twitter-logo.png' + keep_only_tags = [dict(attrs={'class':'subcl'})] + remove_tags = [dict(id='lang_row'), dict(attrs={'class':['pr_typeid', 'pr_news_feature_link', 'outreach_usage', 'hidden']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.eso.org/public/poland/news/feed/'), (u'Og\u0142oszenia', u'http://www.eso.org/public/poland/announcements/feed/'), (u'Zdj\u0119cie tygodnia', u'http://www.eso.org/public/poland/images/potw/feed/')] + + def preprocess_html(self, soup): + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://www.eso.org' + a['href'] + return soup diff --git a/recipes/f1_ultra.recipe b/recipes/f1_ultra.recipe index ada82542fc..4723167c17 100644 --- a/recipes/f1_ultra.recipe +++ b/recipes/f1_ultra.recipe @@ -20,16 +20,16 @@ class f1ultra(BasicNewsRecipe): remove_tags.append(dict(attrs = {'title' : ['PDF', 'Drukuj', 'Email']})) remove_tags.append(dict(name = 'form', attrs = {'method' : 'post'})) remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'})) - + preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''), - (re.compile(r'align="right"'), lambda match: ''), - (re.compile(r'width=\"*\"'), lambda match: ''), - (re.compile(r'\<table .*?\>'), lambda match: '')] - + (re.compile(r'align="right"'), lambda match: ''), + (re.compile(r'width=\"*\"'), lambda match: ''), + (re.compile(r'\<table .*?\>'), lambda match: '')] + extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; } - img { display: block; clear: both;} - ''' + img { display: block; clear: both;} + ''' remove_attributes = 
['width','height','position','float','padding-left','padding-right','padding','text-align'] feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 6b014e8f93..3a86438d1c 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -1,24 +1,27 @@ -from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup + class FilmWebPl(BasicNewsRecipe): title = u'FilmWeb' __author__ = 'fenuks' - description = 'FilmWeb - biggest polish movie site' - cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png' + description = 'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy. Największa baza filmów, seriali i aktorów, repertuar kin i tv, ...' + cover_url = 'http://gfx.filmweb.pl/n/logo-filmweb-bevel.jpg' category = 'movies' language = 'pl' - index='http://www.filmweb.pl' + index = 'http://www.filmweb.pl' + #extra_css = '.MarkupPhotoHTML-7 {float:left; margin-right: 10px;}' oldest_article = 8 max_articles_per_feed = 100 - no_stylesheets= True - remove_empty_feeds=True + no_stylesheets = True + remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} - preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] + remove_javascript = True + preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' - remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), 
dict(attrs={'class':'userSurname anno'})] + remove_tags = [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] remove_attributes = ['style',] - keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] + keep_only_tags = [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), (u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), @@ -41,7 +44,12 @@ class FilmWebPl(BasicNewsRecipe): skip_tag = soup.find('a', attrs={'class':'welcomeScreenButton'}) if skip_tag is not None: return self.index_to_soup(skip_tag['href'], raw=True) - + + def postprocess_html(self, soup, first_fetch): + for r in soup.findAll(attrs={'class':'singlephoto'}): + r['style'] = 'float:left; margin-right: 10px;' + return soup + def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: @@ -56,4 +64,8 @@ class FilmWebPl(BasicNewsRecipe): tag.name = 'div' for t in tag.findAll('li'): t.name = 'div' + for r in soup.findAll(id=re.compile('photo-\d+')): + r.extract() + for r in soup.findAll(style=re.compile('float: ?left')): + r['class'] = 'singlephoto' return soup diff --git a/recipes/financial_times_us.recipe b/recipes/financial_times_us.recipe new file mode 100644 index 0000000000..3821e5ea0e --- /dev/null +++ b/recipes/financial_times_us.recipe @@ -0,0 +1,182 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>' +''' +http://www.ft.com/intl/us-edition +''' + +import datetime 
+from calibre.ptempfile import PersistentTemporaryFile +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class FinancialTimes(BasicNewsRecipe): + title = 'Financial Times (US) printed edition' + __author__ = 'Darko Miletic' + description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy." + publisher = 'The Financial Times Ltd.' + category = 'news, finances, politics, UK, World' + oldest_article = 2 + language = 'en' + max_articles_per_feed = 250 + no_stylesheets = True + use_embedded_content = False + needs_subscription = True + encoding = 'utf8' + publication_type = 'newspaper' + articles_are_obfuscated = True + temp_files = [] + masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg' + LOGIN = 'https://registration.ft.com/registration/barrier/login' + LOGIN2 = 'http://media.ft.com/h/subs3.html' + INDEX = 'http://www.ft.com/intl/us-edition' + PREFIX = 'http://www.ft.com' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.open(self.INDEX) + if self.username is not None and self.password is not None: + br.open(self.LOGIN2) + br.select_form(name='loginForm') + br['username'] = self.username + br['password'] = self.password + br.submit() + return br + + keep_only_tags = [ + dict(name='div' , attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']}) + ,dict(name='div' , attrs={'class':'standfirst'}) + ,dict(name='div' , attrs={'id' :'storyContent'}) + ,dict(name='div' , attrs={'class':['ft-story-body','index-detail']}) + ,dict(name='h2' , attrs={'class':'entry-title'} ) + ,dict(name='span', attrs={'class':lambda x: x and 'posted-on' in x.split()} ) + ,dict(name='span', attrs={'class':'author_byline'} ) + 
,dict(name='div' , attrs={'class':'entry-content'} ) + ] + remove_tags = [ + dict(name='div', attrs={'id':'floating-con'}) + ,dict(name=['meta','iframe','base','object','embed','link']) + ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']}) + ] + remove_attributes = ['width','height','lang'] + + extra_css = """ + body{font-family: Georgia,Times,"Times New Roman",serif} + h2{font-size:large} + .ft-story-header{font-size: x-small} + .container{font-size:x-small;} + h3{font-size:x-small;color:#003399;} + .copyright{font-size: x-small} + img{margin-top: 0.8em; display: block} + .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small} + .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif} + """ + + def get_artlinks(self, elem): + articles = [] + count = 0 + for item in elem.findAll('a',href=True): + count = count + 1 + if self.test and count > 2: + return articles + rawlink = item['href'] + url = rawlink + if not rawlink.startswith('http://'): + url = self.PREFIX + rawlink + try: + urlverified = self.browser.open_novisit(url).geturl() # resolve redirect. 
+ except: + continue + title = self.tag_to_string(item) + date = strftime(self.timefmt) + articles.append({ + 'title' :title + ,'date' :date + ,'url' :urlverified + ,'description':'' + }) + return articles + + def parse_index(self): + feeds = [] + soup = self.index_to_soup(self.INDEX) + dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div')) + self.timefmt = ' [%s]'%dates + wide = soup.find('div',attrs={'class':'wide'}) + if not wide: + return feeds + allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()}) + if not allsections: + return feeds + count = 0 + for item in allsections: + count = count + 1 + if self.test and count > 2: + return feeds + fitem = item.h3 + if not fitem: + fitem = item.h4 + ftitle = self.tag_to_string(fitem) + self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) + feedarts = self.get_artlinks(item.ul) + feeds.append((ftitle,feedarts)) + return feeds + + def preprocess_html(self, soup): + items = ['promo-box','promo-title', + 'promo-headline','promo-image', + 'promo-intro','promo-link','subhead'] + for item in items: + for it in soup.findAll(item): + it.name = 'div' + it.attrs = [] + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup + + def get_cover_url(self): + cdate = datetime.date.today() + if cdate.isoweekday() == 7: + cdate -= datetime.timedelta(days=1) + return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_USA.pdf') + + def get_obfuscated_article(self, url): + count = 0 + while (count < 10): + try: + response = self.browser.open(url) + html = response.read() + count = 10 + except: + print 
"Retrying download..." + count += 1 + tfile = PersistentTemporaryFile('_fa.html') + tfile.write(html) + tfile.close() + self.temp_files.append(tfile) + return tfile.name + + def cleanup(self): + self.browser.open('https://registration.ft.com/registration/login/logout?location=') \ No newline at end of file diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe index 1954fd7803..66864b8561 100644 --- a/recipes/focus_pl.recipe +++ b/recipes/focus_pl.recipe @@ -13,7 +13,7 @@ class FocusRecipe(BasicNewsRecipe): title = u'Focus' publisher = u'Gruner + Jahr Polska' category = u'News' - description = u'Newspaper' + description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety' category = 'magazine' cover_url = '' remove_empty_feeds = True diff --git a/recipes/fotoblogia_pl.recipe b/recipes/fotoblogia_pl.recipe index 99df46419a..a482390e0c 100644 --- a/recipes/fotoblogia_pl.recipe +++ b/recipes/fotoblogia_pl.recipe @@ -3,6 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Fotoblogia_pl(BasicNewsRecipe): title = u'Fotoblogia.pl' __author__ = 'fenuks' + description = u'Jeden z największych polskich blogów o fotografii.' 
category = 'photography' language = 'pl' masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg' @@ -11,6 +12,6 @@ class Fotoblogia_pl(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})] + keep_only_tags=[dict(name='div', attrs={'class':['post-view post-standard', 'photo-container']})] remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})] feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')] diff --git a/recipes/frazpc.recipe b/recipes/frazpc.recipe index b5225e33d5..7d1cb329f8 100644 --- a/recipes/frazpc.recipe +++ b/recipes/frazpc.recipe @@ -18,9 +18,10 @@ class FrazPC(BasicNewsRecipe): max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True + remove_empty_feeds = True cover_url='http://www.frazpc.pl/images/logo.png' feeds = [ - (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'), + (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'), (u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly') ] diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe index 6755770329..6ed5d052a3 100644 --- a/recipes/fronda.recipe +++ b/recipes/fronda.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = u'2010-2012, Tomasz Dlugosz <tomek3d@gmail.com>' +__copyright__ = u'2010-2013, Tomasz Dlugosz <tomek3d@gmail.com>' ''' fronda.pl ''' @@ -23,7 +23,6 @@ class Fronda(BasicNewsRecipe): extra_css = ''' h1 {font-size:150%} .body {text-align:left;} - div.headline {font-weight:bold} ''' earliest_date = date.today() - timedelta(days=oldest_article) @@ -69,10 +68,11 @@ class Fronda(BasicNewsRecipe): article_url = 'http://www.fronda.pl' + article_a['href'] article_title = self.tag_to_string(article_a) articles[genName].append( { 'title' : article_title, 'url' : article_url, 'date' : article_date }) - feeds.append((genName, 
articles[genName])) + if articles[genName]: + feeds.append((genName, articles[genName])) return feeds - keep_only_tags = [ + keep_only_tags = [ dict(name='div', attrs={'class':'yui-g'}) ] @@ -83,6 +83,10 @@ class Fronda(BasicNewsRecipe): dict(name='h3', attrs={'class':'block-header article comments'}), dict(name='ul', attrs={'class':'comment-list'}), dict(name='ul', attrs={'class':'category'}), + dict(name='ul', attrs={'class':'tag-list'}), dict(name='p', attrs={'id':'comments-disclaimer'}), + dict(name='div', attrs={'style':'text-align: left; margin-bottom: 15px;'}), + dict(name='div', attrs={'style':'text-align: left; margin-top: 15px; margin-bottom: 30px;'}), + dict(name='div', attrs={'class':'related-articles content'}), dict(name='div', attrs={'id':'comment-form'}) ] diff --git a/recipes/gazeta_krakowska.recipe b/recipes/gazeta_krakowska.recipe new file mode 100644 index 0000000000..3abbcfdf39 --- /dev/null +++ b/recipes/gazeta_krakowska.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class GazetaKrakowska(BasicNewsRecipe): + title = u'Gazeta Krakowska' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Gazeta Krakowska. Najnowsze Wiadomości Kraków. Informacje Kraków. Czytaj!' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gazetakrakowska.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + + feeds = [(u'Fakty24', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533770/index.rss?201302'), (u'Krak\xf3w', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_krakow.xml?201302'), (u'Tarn\xf3w', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_tarnow.xml?201302'), (u'Nowy S\u0105cz', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_nsacz.xml?201302'), (u'Ma\u0142. 
Zach.', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_malzach.xml?201302'), (u'Podhale', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_podhale.xml?201302'), (u'Sport', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533771/index.rss?201302'), (u'Kultura', u'http://gazetakrakowska.feedsportal.com/c/32980/f/533772/index.rss?201302'), (u'Opinie', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_opinie.xml?201302'), (u'Magazyn', u'http://www.gazetakrakowska.pl/rss/gazetakrakowska_magazyn.xml?201302')] + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/gazeta-krakowska/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gazeta_lubuska.recipe b/recipes/gazeta_lubuska.recipe new file mode 100644 index 0000000000..f2a42b63b8 --- /dev/null +++ b/recipes/gazeta_lubuska.recipe @@ -0,0 +1,69 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class GazetaLubuska(BasicNewsRecipe): + title = u'Gazeta Lubuska' + __author__ = 'fenuks' + description = u'Gazeta Lubuska - portal regionalny województwa lubuskiego.' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.gazetalubuska.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + feeds = [(u'Wszystkie', u'http://www.gazetalubuska.pl/rss.xml'), (u'Dreznenko', u'http://www.gazetalubuska.pl/drezdenko.xml'), (u'G\u0142og\xf3w', u'http://www.gazetalubuska.pl/glogow.xml'), (u'Gorz\xf3w Wielkopolski', u'http://www.gazetalubuska.pl/gorzow-wielkopolski.xml'), (u'Gubin', u'http://www.gazetalubuska.pl/gubin.xml'), (u'Kostrzyn', u'http://www.gazetalubuska.pl/kostrzyn.xml'), (u'Krosno Odrza\u0144skie', u'http://www.gazetalubuska.pl/krosno-odrzanskie.xml'), (u'Lubsko', u'http://www.gazetalubuska.pl/lubsko.xml'), (u'Mi\u0119dzych\xf3d', u'http://www.gazetalubuska.pl/miedzychod.xml'), (u'Mi\u0119dzyrzecz', u'http://www.gazetalubuska.pl/miedzyrzecz.xml'), (u'Nowa S\xf3l', u'http://www.gazetalubuska.pl/nowa-sol.xml'), (u'S\u0142ubice', u'http://www.gazetalubuska.pl/slubice.xml'), (u'Strzelce Kraje\u0144skie', 
u'http://www.gazetalubuska.pl/strzelce-krajenskie.xml'), (u'Sulech\xf3w', u'http://www.gazetalubuska.pl/sulechow.xml'), (u'Sul\u0119cin', u'http://www.gazetalubuska.pl/sulecin.xml'), (u'\u015awi\u0119bodzin', u'http://www.gazetalubuska.pl/swiebodzin.xml'), (u'Wolsztyn', u'http://www.gazetalubuska.pl/wolsztyn.xml'), (u'Wschowa', u'http://www.gazetalubuska.pl/wschowa.xml'), (u'Zielona G\xf3ra', u'http://www.gazetalubuska.pl/zielona-gora.xml'), (u'\u017baga\u0144', u'http://www.gazetalubuska.pl/zagan.xml'), (u'\u017bary', u'http://www.gazetalubuska.pl/zary.xml'), (u'Sport', u'http://www.gazetalubuska.pl/sport.xml'), (u'Auto', u'http://www.gazetalubuska.pl/auto.xml'), (u'Dom', u'http://www.gazetalubuska.pl/dom.xml'), (u'Praca', u'http://www.gazetalubuska.pl/praca.xml'), (u'Zdrowie', u'http://www.gazetalubuska.pl/zdrowie.xml')] + + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + 
appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe index 59188a5d6a..59b3b00933 100644 --- a/recipes/gazeta_pl_krakow.recipe +++ b/recipes/gazeta_pl_krakow.recipe @@ -49,8 +49,8 @@ class gw_krakow(BasicNewsRecipe): feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: new_soup=self.index_to_soup(tag['href'], raw=True) return new_soup @@ -95,8 +95,7 @@ class gw_krakow(BasicNewsRecipe): rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup - + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe index 2d95bcc06f..9e10a0610c 100644 --- a/recipes/gazeta_pl_warszawa.recipe +++ b/recipes/gazeta_pl_warszawa.recipe @@ -46,8 +46,8 @@ class gw_wawa(BasicNewsRecipe): feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: new_soup=self.index_to_soup(tag['href'], raw=True) return new_soup @@ -92,8 +92,7 @@ class gw_wawa(BasicNewsRecipe): rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup - + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup diff --git 
a/recipes/gazeta_pomorska.recipe b/recipes/gazeta_pomorska.recipe index 083f5cbeed..a4dc8ed1ea 100644 --- a/recipes/gazeta_pomorska.recipe +++ b/recipes/gazeta_pomorska.recipe @@ -1,104 +1,96 @@ -#!/usr/bin/env python - -# # Przed uzyciem przeczytaj komentarz w sekcji "feeds" - -__license__ = 'GPL v3' -__copyright__ = u'2010, Richard z forum.eksiazki.org' -'''pomorska.pl''' - import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class GazetaPomorska(BasicNewsRecipe): title = u'Gazeta Pomorska' - publisher = u'Gazeta Pomorska' - description = u'Kujawy i Pomorze - wiadomo\u015bci' + __author__ = 'Richard z forum.eksiazki.org, fenuks' + description = u'Gazeta Pomorska - portal regionalny' + category = 'newspaper' language = 'pl' - __author__ = u'Richard z forum.eksiazki.org' - # # (dziekuje t3d z forum.eksiazki.org za testy) - oldest_article = 2 - max_articles_per_feed = 20 + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.pomorska.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True no_stylesheets = True - remove_javascript = True - preprocess_regexps = [ - (re.compile(r'<a href="http://maps.google[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''), - (re.compile(r'[<Bb >]*Poznaj opinie[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''), - (re.compile(r'[<Bb >]*Przeczytaj[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''), - (re.compile(r'[<Bb >]*Wi.cej informacji[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''), - (re.compile(r'<a href[^>]*>[<Bb >]*Wideo[^<]*[</Bb >]*[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''), - (re.compile(r'<a href[^>]*>[<Bb >]*KLIKNIJ TUTAJ[^<]*[</Bb >]*[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: '') - ] + 
ignore_duplicate_articles = {'title', 'url'} - feeds = [ -# # Tutaj jest wymieniona lista kategorii jakie mozemy otrzymywac z Gazety -# # Pomorskiej, po jednej kategorii w wierszu. Jesli na poczatku danego wiersza -# # znajduje sie jeden znak "#", oznacza to ze kategoria jest zakomentowana -# # i nie bedziemy jej otrzymywac. Jesli chcemy ja otrzymywac nalezy usunac -# # znak # z jej wiersza. -# # Jesli subskrybujemy wiecej niz jedna kategorie, na koncu wiersza z kazda -# # kategoria musi sie znajdowac niezakomentowany przecinek, z wyjatkiem -# # ostatniego wiersza - ma byc bez przecinka na koncu. -# # Rekomendowane opcje wyboru kategorii: -# # 1. PomorskaRSS - wiadomosci kazdego typu, lub -# # 2. Region + wybrane miasta, lub -# # 3. Wiadomosci tematyczne. -# # Lista kategorii: + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] - # # PomorskaRSS - wiadomosci kazdego typu, zakomentuj znakiem "#" - # # przed odkomentowaniem wiadomosci wybranego typu: - (u'PomorskaRSS', u'http://www.pomorska.pl/rss.xml') - - # # wiadomosci z regionu nie przypisane do okreslonego miasta: - # (u'Region', u'http://www.pomorska.pl/region.xml'), - - # # wiadomosci przypisane do miast: - # (u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'), - # (u'Nak\u0142o', u'http://www.pomorska.pl/naklo.xml'), - # (u'Koronowo', u'http://www.pomorska.pl/koronowo.xml'), - # (u'Solec Kujawski', u'http://www.pomorska.pl/soleckujawski.xml'), - # (u'Grudzi\u0105dz', u'http://www.pomorska.pl/grudziadz.xml'), - # (u'Inowroc\u0142aw', u'http://www.pomorska.pl/inowroclaw.xml'), - # (u'Toru\u0144', u'http://www.pomorska.pl/torun.xml'), - # (u'W\u0142oc\u0142awek', u'http://www.pomorska.pl/wloclawek.xml'), - # 
(u'Aleksandr\u00f3w Kujawski', u'http://www.pomorska.pl/aleksandrow.xml'), - # (u'Brodnica', u'http://www.pomorska.pl/brodnica.xml'), - # (u'Che\u0142mno', u'http://www.pomorska.pl/chelmno.xml'), - # (u'Chojnice', u'http://www.pomorska.pl/chojnice.xml'), - # (u'Ciechocinek', u'http://www.pomorska.pl/ciechocinek.xml'), - # (u'Golub Dobrzy\u0144', u'http://www.pomorska.pl/golubdobrzyn.xml'), - # (u'Mogilno', u'http://www.pomorska.pl/mogilno.xml'), - # (u'Radziej\u00f3w', u'http://www.pomorska.pl/radziejow.xml'), - # (u'Rypin', u'http://www.pomorska.pl/rypin.xml'), - # (u'S\u0119p\u00f3lno', u'http://www.pomorska.pl/sepolno.xml'), - # (u'\u015awiecie', u'http://www.pomorska.pl/swiecie.xml'), - # (u'Tuchola', u'http://www.pomorska.pl/tuchola.xml'), - # (u'\u017bnin', u'http://www.pomorska.pl/znin.xml') - - # # wiadomosci tematyczne (redundancja z region/miasta): - # (u'Sport', u'http://www.pomorska.pl/sport.xml'), - # (u'Zdrowie', u'http://www.pomorska.pl/zdrowie.xml'), - # (u'Auto', u'http://www.pomorska.pl/moto.xml'), - # (u'Dom', u'http://www.pomorska.pl/dom.xml'), - # (u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'), - # (u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml') - ] + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] - keep_only_tags = [dict(name='div', attrs={'id':'article'})] + feeds = [(u'Wszystkie', u'http://www.pomorska.pl/rss.xml'), + (u'Region', u'http://www.pomorska.pl/region.xml'), + (u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'), + (u'Nakło', u'http://www.pomorska.pl/naklo.xml'), + (u'Koronowo', u'http://www.pomorska.pl/koronowo.xml'), + (u'Solec Kujawski', 
u'http://www.pomorska.pl/soleckujawski.xml'), + (u'Grudziądz', u'http://www.pomorska.pl/grudziadz.xml'), + (u'Inowrocław', u'http://www.pomorska.pl/inowroclaw.xml'), + (u'Toruń', u'http://www.pomorska.pl/torun.xml'), + (u'Włocławek', u'http://www.pomorska.pl/wloclawek.xml'), + (u'Aleksandrów Kujawski', u'http://www.pomorska.pl/aleksandrow.xml'), + (u'Brodnica', u'http://www.pomorska.pl/brodnica.xml'), + (u'Chełmno', u'http://www.pomorska.pl/chelmno.xml'), + (u'Chojnice', u'http://www.pomorska.pl/chojnice.xml'), + (u'Ciechocinek', u'http://www.pomorska.pl/ciechocinek.xml'), + (u'Golub-Dobrzyń', u'http://www.pomorska.pl/golubdobrzyn.xml'), + (u'Mogilno', u'http://www.pomorska.pl/mogilno.xml'), + (u'Radziejów', u'http://www.pomorska.pl/radziejow.xml'), + (u'Rypin', u'http://www.pomorska.pl/rypin.xml'), + (u'Sępólno', u'http://www.pomorska.pl/sepolno.xml'), + (u'Świecie', u'http://www.pomorska.pl/swiecie.xml'), + (u'Tuchola', u'http://www.pomorska.pl/tuchola.xml'), + (u'Żnin', u'http://www.pomorska.pl/znin.xml'), + (u'Sport', u'http://www.pomorska.pl/sport.xml'), + (u'Zdrowie', u'http://www.pomorska.pl/zdrowie.xml'), + (u'Auto', u'http://www.pomorska.pl/moto.xml'), + (u'Dom', u'http://www.pomorska.pl/dom.xml'), + #(u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'), + (u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')] - remove_tags = [ - dict(name='p', attrs={'id':'articleTags'}), - dict(name='div', attrs={'id':'articleEpaper'}), - dict(name='div', attrs={'id':'articleConnections'}), - dict(name='div', attrs={'class':'articleFacts'}), - dict(name='div', attrs={'id':'articleExternalLink'}), - dict(name='div', attrs={'id':'articleMultimedia'}), - dict(name='div', attrs={'id':'articleGalleries'}), - dict(name='div', attrs={'id':'articleAlarm'}), - dict(name='div', attrs={'id':'adholder_srodek1'}), - dict(name='div', attrs={'id':'articleVideo'}), - dict(name='a', attrs={'name':'fb_share'})] + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + 
'/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) - extra_css = '''h1 { font-size: 1.4em; } - h2 { font-size: 1.0em; }''' + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/gazeta_wroclawska.recipe b/recipes/gazeta_wroclawska.recipe new file mode 100644 index 0000000000..5bcb5654c0 --- /dev/null +++ b/recipes/gazeta_wroclawska.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class GazetaWroclawska(BasicNewsRecipe): + title = u'Gazeta Wroc\u0142awska' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Gazeta Wrocławska. Najnowsze Wiadomości Wrocław, Informacje Wrocław. Czytaj!' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gazetawroclawska.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + + feeds = [(u'Fakty24', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533775/index.rss?201302'), (u'Region', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_region.xml?201302'), (u'Kultura', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533777/index.rss?201302'), (u'Sport', u'http://gazetawroclawska.feedsportal.com/c/32980/f/533776/index.rss?201302'), (u'Z archiwum', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_zarchiwum.xml?201302'), (u'M\xf3j reporter', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_mojreporter.xml?201302'), (u'Historia', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_historia.xml?201302'), (u'Listy do redakcji', 
u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_listydoredakcji.xml?201302'), (u'Na drogach', u'http://www.gazetawroclawska.pl/rss/gazetawroclawska_nadrogach.xml?201302')] + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/gazeta-wroclawska/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gazeta_wspolczesna.recipe b/recipes/gazeta_wspolczesna.recipe new file mode 100644 index 0000000000..6648d8eb1a --- /dev/null +++ b/recipes/gazeta_wspolczesna.recipe @@ -0,0 +1,68 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class GazetaWspolczesna(BasicNewsRecipe): + title = u'Gazeta Wsp\xf3\u0142czesna' + __author__ = 'fenuks' + description = u'Gazeta Współczesna - portal regionalny.' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.wspolczesna.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + feeds = [(u'Wszystkie', u'http://www.wspolczesna.pl/rss.xml'), (u'August\xf3w', u'http://www.wspolczesna.pl/augustow.xml'), (u'Bia\u0142ystok', u'http://www.wspolczesna.pl/bialystok.xml'), (u'Bielsk Podlaski', u'http://www.wspolczesna.pl/bielsk.xml'), (u'E\u0142k', u'http://www.wspolczesna.pl/elk.xml'), (u'Grajewo', u'http://www.wspolczesna.pl/grajewo.xml'), (u'Go\u0142dap', u'http://www.wspolczesna.pl/goldap.xml'), (u'Hajn\xf3wka', u'http://www.wspolczesna.pl/hajnowka.xml'), (u'Kolno', u'http://www.wspolczesna.pl/kolno.xml'), (u'\u0141om\u017ca', u'http://www.wspolczesna.pl/lomza.xml'), (u'Mo\u0144ki', u'http://www.wspolczesna.pl/monki.xml'), (u'Olecko', u'http://www.wspolczesna.pl/olecko.xml'), (u'Ostro\u0142\u0119ka', u'http://www.wspolczesna.pl/ostroleka.xml'), (u'Powiat Bia\u0142ostocki', u'http://www.wspolczesna.pl/powiat.xml'), (u'Sejny', 
u'http://www.wspolczesna.pl/sejny.xml'), (u'Siemiatycze', u'http://www.wspolczesna.pl/siemiatycze.xml'), (u'Sok\xf3\u0142ka', u'http://www.wspolczesna.pl/sokolka.xml'), (u'Suwa\u0142ki', u'http://www.wspolczesna.pl/suwalki.xml'), (u'Wysokie Mazowieckie', u'http://www.wspolczesna.pl/wysokie.xml'), (u'Zambr\xf3w', u'http://www.wspolczesna.pl/zambrow.xml'), (u'Sport', u'http://www.wspolczesna.pl/sport.xml'), (u'Praca', u'http://www.wspolczesna.pl/praca.xml'), (u'Dom', u'http://www.wspolczesna.pl/dom.xml'), (u'Auto', u'http://www.wspolczesna.pl/auto.xml'), (u'Zdrowie', u'http://www.wspolczesna.pl/zdrowie.xml')] + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return 
soup diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 633b80444a..c415edc9d0 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.ebooks.BeautifulSoup import Comment class Gazeta_Wyborcza(BasicNewsRecipe): title = u'Gazeta.pl' __author__ = 'fenuks, Artur Stachecki' language = 'pl' - description = 'news from gazeta.pl' + description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.' category = 'newspaper' publication_type = 'newspaper' masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' @@ -16,6 +16,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe): max_articles_per_feed = 100 remove_javascript = True no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} remove_tags_before = dict(id='k0') remove_tags_after = dict(id='banP4') remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] @@ -48,6 +49,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe): url = self.INDEX + link['href'] soup2 = self.index_to_soup(url) pagetext = soup2.find(id='artykul') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) tag = soup2.find('div', attrs={'id': 'Str'}) @@ -65,6 +69,9 @@ class Gazeta_Wyborcza(BasicNewsRecipe): nexturl = pagetext.find(id='gal_btn_next') if nexturl: nexturl = nexturl.a['href'] + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) rem = appendtag.find(id='gal_navi') @@ -105,3 +112,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe): soup = 
self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href']) self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) + + '''def image_url_processor(self, baseurl, url): + print "@@@@@@@@", url + return url.replace('http://wyborcza.pl/ ', '')''' diff --git a/recipes/gcn.recipe b/recipes/gcn.recipe new file mode 100644 index 0000000000..5f403bfd73 --- /dev/null +++ b/recipes/gcn.recipe @@ -0,0 +1,88 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class GCN(BasicNewsRecipe): + title = u'Gazeta Codziennej Nowiny' + __author__ = 'fenuks' + description = u'nowiny24.pl - portal regionalny województwa podkarpackiego.' + category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.nowiny24.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['style'] + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + feeds = [(u'Wszystkie', u'http://www.nowiny24.pl/rss.xml'), + (u'Podkarpacie', 
u'http://www.nowiny24.pl/podkarpacie.xml'), + (u'Bieszczady', u'http://www.nowiny24.pl/bieszczady.xml'), + (u'Rzeszów', u'http://www.nowiny24.pl/rzeszow.xml'), + (u'Przemyśl', u'http://www.nowiny24.pl/przemysl.xml'), + (u'Leżajsk', u'http://www.nowiny24.pl/lezajsk.xml'), + (u'Łańcut', u'http://www.nowiny24.pl/lancut.xml'), + (u'Dębica', u'http://www.nowiny24.pl/debica.xml'), + (u'Jarosław', u'http://www.nowiny24.pl/jaroslaw.xml'), + (u'Krosno', u'http://www.nowiny24.pl/krosno.xml'), + (u'Mielec', u'http://www.nowiny24.pl/mielec.xml'), + (u'Nisko', u'http://www.nowiny24.pl/nisko.xml'), + (u'Sanok', u'http://www.nowiny24.pl/sanok.xml'), + (u'Stalowa Wola', u'http://www.nowiny24.pl/stalowawola.xml'), + (u'Tarnobrzeg', u'http://www.nowiny24.pl/tarnobrzeg.xml'), + (u'Sport', u'http://www.nowiny24.pl/sport.xml'), + (u'Dom', u'http://www.nowiny24.pl/dom.xml'), + (u'Auto', u'http://www.nowiny24.pl/auto.xml'), + (u'Praca', u'http://www.nowiny24.pl/praca.xml'), + (u'Zdrowie', u'http://www.nowiny24.pl/zdrowie.xml'), + (u'Wywiady', u'http://www.nowiny24.pl/wywiady.xml')] + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = 
soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/geopolityka.recipe b/recipes/geopolityka.recipe new file mode 100644 index 0000000000..9749007479 --- /dev/null +++ b/recipes/geopolityka.recipe @@ -0,0 +1,12 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class BasicUserRecipe1361379046(BasicNewsRecipe): + title = u'Geopolityka.org' + language = 'pl' + __author__ = 'chemik111' + oldest_article = 15 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Rss', u'http://geopolityka.org/index.php?format=feed&type=rss')] + diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index 525cf6c605..6d3528b0bc 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -11,12 +11,13 @@ class Gildia(BasicNewsRecipe): language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - remove_empty_feeds=True - no_stylesheets=True + remove_empty_feeds = True + no_stylesheets = True ignore_duplicate_articles = {'title', 'url'} preprocess_regexps = [(re.compile(ur'</?sup>'), lambda match: '') ] - remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})] - keep_only_tags=dict(name='div', attrs={'class':'widetext'}) + ignore_duplicate_articles = {'title', 'url'} + remove_tags = [dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})] + keep_only_tags = dict(name='div', 
attrs={'class':'widetext'}) feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'), (u'Literatura', u'http://www.literatura.gildia.pl/rss'), (u'Film', u'http://www.film.gildia.pl/rss'), (u'Horror', u'http://www.horror.gildia.pl/rss'), (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'), (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'), (u'Manga i anime', u'http://www.manga.gildia.pl/rss'), (u'Star Wars', u'http://www.starwars.gildia.pl/rss'), (u'Techno', u'http://www.techno.gildia.pl/rss'), (u'Historia', u'http://www.historia.gildia.pl/rss'), (u'Magia', u'http://www.magia.gildia.pl/rss'), (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'), (u'RPG', u'http://www.rpg.gildia.pl/rss'), (u'LARP', u'http://www.larp.gildia.pl/rss'), (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'), (u'Nauka', u'http://www.nauka.gildia.pl/rss')] @@ -34,7 +35,7 @@ class Gildia(BasicNewsRecipe): def preprocess_html(self, soup): for a in soup('a'): - if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: + if a.has_key('href') and not a['href'].startswith('http'): if '/gry/' in a['href']: a['href']='http://www.gry.gildia.pl' + a['href'] elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower(): diff --git a/recipes/glos_wielkopolski.recipe b/recipes/glos_wielkopolski.recipe new file mode 100644 index 0000000000..d7706c4173 --- /dev/null +++ b/recipes/glos_wielkopolski.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class GlosWielkopolski(BasicNewsRecipe): + title = u'G\u0142os Wielkopolski' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Głos Wielkopolski. Najnowsze Wiadomości Poznań. Czytaj Informacje Poznań!' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/gloswielkopolski.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds= True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + + feeds = [(u'Wszystkie', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533779/index.rss?201302'), (u'Wiadomo\u015bci', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533780/index.rss?201302'), (u'Sport', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533781/index.rss?201302'), (u'Kultura', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533782/index.rss?201302'), (u'Porady', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_porady.xml?201302'), (u'Blogi', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_blogi.xml?201302'), (u'Nasze akcje', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_naszeakcje.xml?201302'), (u'Opinie', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_opinie.xml?201302'), (u'Magazyn', 
u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_magazyn.xml?201302')] + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/glos-wielkopolski/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe index 59c8fc2f26..11beb076f5 100644 --- a/recipes/gosc_niedzielny.recipe +++ b/recipes/gosc_niedzielny.recipe @@ -2,7 +2,8 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com' +__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \ + 2013, Tomasz Długosz, tomek3d@gmail.com' from calibre.web.feeds.news import BasicNewsRecipe from calibre.ptempfile import PersistentTemporaryFile @@ -12,9 +13,9 @@ import re class GN(BasicNewsRecipe): EDITION = 0 - __author__ = 'Piotr Kontek' - title = u'Gość niedzielny' - description = 'Weekly magazine' + __author__ = 'Piotr Kontek, Tomasz Długosz' + title = u'Gość Niedzielny' + description = 'Ogólnopolski tygodnik katolicki' encoding = 'utf-8' no_stylesheets = True language = 'pl' @@ -38,17 +39,25 @@ class GN(BasicNewsRecipe): first = True for p in main_section.findAll('p', attrs={'class':None}, recursive=False): if first and p.find('img') != None: - article = article + '<p>' - article = article + str(p.find('img')).replace('src="/files/','src="http://www.gosc.pl/files/') - article = article + '<font size="-2">' + article += '<p>' + article += str(p.find('img')).replace('src="/files/','src="http://www.gosc.pl/files/') + article += '<font size="-2">' for s in p.findAll('span'): - article = article + self.tag_to_string(s) - article = article + '</font></p>' + article += self.tag_to_string(s) 
+ article += '</font></p>' else: - article = article + str(p).replace('src="/files/','src="http://www.gosc.pl/files/') + article += str(p).replace('src="/files/','src="http://www.gosc.pl/files/') first = False + limiter = main_section.find('p', attrs={'class' : 'limiter'}) + if limiter: + article += str(limiter) - html = unicode(title) + unicode(authors) + unicode(article) + html = unicode(title) + #sometimes authors are not filled in: + if authors: + html += unicode(authors) + unicode(article) + else: + html += unicode(article) self.temp_files.append(PersistentTemporaryFile('_temparse.html')) self.temp_files[-1].write(html) @@ -65,7 +74,8 @@ class GN(BasicNewsRecipe): if img != None: a = img.parent self.EDITION = a['href'] - self.title = img['alt'] + #this was preventing kindles from moving old issues to 'Back Issues' category: + #self.title = img['alt'] self.cover_url = 'http://www.gosc.pl' + img['src'] if year != date.today().year or not first: break diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe index 3852f65d32..baaac85492 100644 --- a/recipes/gram_pl.recipe +++ b/recipes/gram_pl.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup + class Gram_pl(BasicNewsRecipe): title = u'Gram.pl' __author__ = 'fenuks' @@ -11,15 +12,14 @@ class Gram_pl(BasicNewsRecipe): max_articles_per_feed = 100 ignore_duplicate_articles = {'title', 'url'} no_stylesheets= True + remove_empty_feeds = True #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' keep_only_tags= [dict(id='articleModule')] - remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter']})] + remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']})] feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), - (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'), - 
(u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'), - #(u'Kolektyw- Moto Games', u'http://www.motogames.gram.pl/news.rss') - ] + (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles') + ] def parse_feeds (self): feeds = BasicNewsRecipe.parse_feeds(self) diff --git a/recipes/gry_online_pl.recipe b/recipes/gry_online_pl.recipe index 4b9282bdd3..2876a9b4e8 100644 --- a/recipes/gry_online_pl.recipe +++ b/recipes/gry_online_pl.recipe @@ -1,20 +1,24 @@ +import time from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class GryOnlinePl(BasicNewsRecipe): title = u'Gry-Online.pl' __author__ = 'fenuks' - description = 'Gry-Online.pl - computer games' + description = u'Wiadomości o grach, recenzje, zapowiedzi. Encyklopedia Gier zawiera opisy gier na PC, konsole Xbox360, PS3 i inne platformy.' category = 'games' language = 'pl' oldest_article = 13 - INDEX= 'http://www.gry-online.pl/' - masthead_url='http://www.gry-online.pl/im/gry-online-logo.png' - cover_url='http://www.gry-online.pl/im/gry-online-logo.png' + INDEX = 'http://www.gry-online.pl/' + masthead_url = 'http://www.gry-online.pl/im/gry-online-logo.png' + cover_url = 'http://www.gry-online.pl/im/gry-online-logo.png' max_articles_per_feed = 100 - no_stylesheets= True - keep_only_tags=[dict(name='div', attrs={'class':['gc660', 'gc660 S013']})] - remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})] - feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] + no_stylesheets = True + keep_only_tags = [dict(name='div', attrs={'class':['gc660', 'gc660 S013', 'news_endpage_tit', 'news_container', 'news']})] + remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 
'wiecej-txt2']})] + feeds = [ + (u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), + ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')] def append_page(self, soup, appendtag): @@ -24,17 +28,69 @@ class GryOnlinePl(BasicNewsRecipe): url_part = soup.find('link', attrs={'rel':'canonical'})['href'] url_part = url_part[25:].rpartition('?')[0] for nexturl in nexturls[1:-1]: - soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href']) + finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href'] + for i in range(10): + try: + soup2 = self.index_to_soup(finalurl) + break + except: + print 'retrying in 0.5s' + time.sleep(0.5) pagetext = soup2.find(attrs={'class':'gc660'}) for r in pagetext.findAll(name='header'): r.extract() for r in pagetext.findAll(attrs={'itemprop':'description'}): r.extract() + pos = len(appendtag.contents) appendtag.insert(pos, pagetext) for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}): r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + else: + tag = appendtag.find('div', attrs={'class':'S018stronyr'}) + if tag: + nexturl = tag.a + url_part = soup.find('link', attrs={'rel':'canonical'})['href'] + url_part = url_part[25:].rpartition('?')[0] + while tag: + end = tag.find(attrs={'class':'right left-dead'}) + if end: + break + else: + nexturl = tag.a + finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href'] + for i in range(10): + try: + soup2 = self.index_to_soup(finalurl) + break + except: + print 'retrying in 0.5s' + time.sleep(0.5) + tag = soup2.find('div', attrs={'class':'S018stronyr'}) + pagetext = soup2.find(attrs={'class':'gc660'}) + for r in pagetext.findAll(name='header'): + r.extract() + for r in pagetext.findAll(attrs={'itemprop':'description'}): + r.extract() + + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + 
[comment.extract() for comment in comments] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}): + r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + def image_url_processor(self, baseurl, url): + if url.startswith('..'): + return url[2:] + else: + return url def preprocess_html(self, soup): self.append_page(soup, soup.body) diff --git a/recipes/harpers_full.recipe b/recipes/harpers_full.recipe index a63f828968..c206c7a064 100644 --- a/recipes/harpers_full.recipe +++ b/recipes/harpers_full.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>' +__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>' ''' harpers.org - paid subscription/ printed issue articles This recipe only get's article's published in text format @@ -72,7 +72,8 @@ class Harpers_full(BasicNewsRecipe): #go to the current issue soup1 = self.index_to_soup(currentIssue_url) - date = re.split('\s\|\s',self.tag_to_string(soup1.head.title.string))[0] + currentIssue_title = self.tag_to_string(soup1.head.title.string) + date = re.split('\s\|\s',currentIssue_title)[0] self.timefmt = u' [%s]'%date #get cover @@ -84,27 +85,23 @@ class Harpers_full(BasicNewsRecipe): count = 0 for item in soup1.findAll('div', attrs={'class':'articleData'}): text_links = item.findAll('h2') - for text_link in text_links: - if count == 0: - count = 1 - else: - url = text_link.a['href'] - title = text_link.a.contents[0] - date = strftime(' %B %Y') - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':'' - }) - return [(soup1.head.title.string, articles)] + if text_links: + for text_link in text_links: + if count == 0: + count = 1 + else: + url = text_link.a['href'] + title = 
self.tag_to_string(text_link.a) + date = strftime(' %B %Y') + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':'' + }) + return [(currentIssue_title, articles)] def print_version(self, url): return url + '?single=1' - def cleanup(self): - soup = self.index_to_soup('http://harpers.org/') - signouturl=self.tag_to_string(soup.find('li', attrs={'class':'subLogOut'}).findNext('li').a['href']) - self.log(signouturl) - self.browser.open(signouturl) diff --git a/recipes/hatalska.recipe b/recipes/hatalska.recipe new file mode 100644 index 0000000000..22ee77b31a --- /dev/null +++ b/recipes/hatalska.recipe @@ -0,0 +1,27 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = 'teepel 2012' + +''' +hatalska.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class hatalska(BasicNewsRecipe): + title = u'Hatalska' + __author__ = 'teepel <teepel44@gmail.com>' + language = 'pl' + description = u'Blog specjalistki z branży mediowo-reklamowej - Natalii Hatalskiej' + oldest_article = 7 + masthead_url='http://hatalska.com/wp-content/themes/jamel/images/logo.png' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript=True + no_stylesheets=True + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'feedflare'})) + + feeds = [(u'Blog', u'http://feeds.feedburner.com/hatalskacom')] diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index eb84fc4031..74c0d6539a 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -41,13 +41,16 @@ class TheHindu(BasicNewsRecipe): if current_section and x.get('class', '') == 'tpaper': a = x.find('a', href=True) if a is not None: + title = self.tag_to_string(a) + self.log('\tFound article:', title) current_articles.append({'url':a['href']+'?css=print', - 'title':self.tag_to_string(a), 'date': '', + 'title':title, 'date': '', 'description':''}) if x.name == 'h3': if current_section and current_articles: feeds.append((current_section, 
current_articles)) current_section = self.tag_to_string(x) + self.log('Found section:', current_section) current_articles = [] return feeds diff --git a/recipes/hnonline.recipe b/recipes/hnonline.recipe new file mode 100644 index 0000000000..5f6a48b026 --- /dev/null +++ b/recipes/hnonline.recipe @@ -0,0 +1,67 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class HNonlineRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'lacike' + language = 'sk' + version = 1 + + title = u'HNonline' + publisher = u'HNonline' + category = u'News, Newspaper' + description = u'News from Slovakia' + cover_url = u'http://hnonline.sk/img/sk/_relaunch/logo2.png' + + oldest_article = 1 + max_articles_per_feed = 100 + use_embedded_content = False + remove_empty_feeds = True + + no_stylesheets = True + remove_javascript = True + + # Feeds from: http://rss.hnonline.sk, for listing see http://rss.hnonline.sk/prehlad + feeds = [] + feeds.append((u'HNonline|Ekonomika a firmy', u'http://rss.hnonline.sk/?p=kC1000')) + feeds.append((u'HNonline|Slovensko', u'http://rss.hnonline.sk/?p=kC2000')) + feeds.append((u'HNonline|Svet', u'http://rss.hnonline.sk/?p=kC3000')) + feeds.append((u'HNonline|\u0160port', u'http://rss.hnonline.sk/?p=kC4000')) + feeds.append((u'HNonline|Online rozhovor', u'http://rss.hnonline.sk/?p=kCR000')) + + feeds.append((u'FinWeb|Spr\u00E1vy zo sveta financi\u00ED', u'http://rss.finweb.hnonline.sk/spravodajstvo')) + feeds.append((u'FinWeb|Koment\u00E1re a anal\u00FDzy', u'http://rss.finweb.hnonline.sk/?p=kPC200')) + feeds.append((u'FinWeb|Invest\u00EDcie', u'http://rss.finweb.hnonline.sk/?p=kPC300')) + feeds.append((u'FinWeb|Svet akci\u00ED', u'http://rss.finweb.hnonline.sk/?p=kPC400')) + feeds.append((u'FinWeb|Rozhovory', u'http://rss.finweb.hnonline.sk/?p=kPC500')) + feeds.append((u'FinWeb|T\u00E9ma t\u00FD\u017Ed\u0148a', u'http://rss.finweb.hnonline.sk/?p=kPC600')) + feeds.append((u'FinWeb|Rebr\u00ED\u010Dky', 
u'http://rss.finweb.hnonline.sk/?p=kPC700')) + + feeds.append((u'HNstyle|Kult\u00FAra', u'http://style.hnonline.sk/?p=kTC100')) + feeds.append((u'HNstyle|Auto-moto', u'http://style.hnonline.sk/?p=kTC200')) + feeds.append((u'HNstyle|Digit\u00E1l', u'http://style.hnonline.sk/?p=kTC300')) + feeds.append((u'HNstyle|Veda', u'http://style.hnonline.sk/?p=kTCV00')) + feeds.append((u'HNstyle|Dizajn', u'http://style.hnonline.sk/?p=kTC400')) + feeds.append((u'HNstyle|Cestovanie', u'http://style.hnonline.sk/?p=kTCc00')) + feeds.append((u'HNstyle|V\u00EDkend', u'http://style.hnonline.sk/?p=kTC800')) + feeds.append((u'HNstyle|Gastro', u'http://style.hnonline.sk/?p=kTC600')) + feeds.append((u'HNstyle|M\u00F3da', u'http://style.hnonline.sk/?p=kTC700')) + feeds.append((u'HNstyle|Modern\u00E1 \u017Eena', u'http://style.hnonline.sk/?p=kTCA00')) + feeds.append((u'HNstyle|Pre\u010Do nie?!', u'http://style.hnonline.sk/?p=k7C000')) + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'h1', attrs = {'class': 'detail-titulek'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-podtitulek'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-perex'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-text'})) + + remove_tags = [] + #remove_tags.append(dict(name = 'div', attrs = {'id': re.compile('smeplayer.*')})) + + remove_tags_after = [] + #remove_tags_after = [dict(name = 'p', attrs = {'class': 'autor_line'})] + + extra_css = ''' + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/LiberationSans.ttf)} + body {font-family: sans1, serif1;} + ''' diff --git a/recipes/icons/astronomia_pl.png b/recipes/icons/astronomia_pl.png index b854091853..4cc1eea230 100644 Binary files a/recipes/icons/astronomia_pl.png and b/recipes/icons/astronomia_pl.png differ diff --git a/recipes/icons/bachormagazyn.png 
b/recipes/icons/bachormagazyn.png new file mode 100644 index 0000000000..c379b51ae7 Binary files /dev/null and b/recipes/icons/bachormagazyn.png differ diff --git a/recipes/icons/badania_net.png b/recipes/icons/badania_net.png new file mode 100644 index 0000000000..de915de8d1 Binary files /dev/null and b/recipes/icons/badania_net.png differ diff --git a/recipes/icons/bash_org_pl.png b/recipes/icons/bash_org_pl.png index 5fc18a38e0..abda46c18b 100644 Binary files a/recipes/icons/bash_org_pl.png and b/recipes/icons/bash_org_pl.png differ diff --git a/recipes/icons/biweekly.png b/recipes/icons/biweekly.png new file mode 100644 index 0000000000..00356e091d Binary files /dev/null and b/recipes/icons/biweekly.png differ diff --git a/recipes/icons/blog_biszopa.png b/recipes/icons/blog_biszopa.png new file mode 100644 index 0000000000..eaba074cde Binary files /dev/null and b/recipes/icons/blog_biszopa.png differ diff --git a/recipes/icons/cgm_pl.png b/recipes/icons/cgm_pl.png index 0e332f720e..038b475312 100644 Binary files a/recipes/icons/cgm_pl.png and b/recipes/icons/cgm_pl.png differ diff --git a/recipes/icons/czas_gentlemanow.png b/recipes/icons/czas_gentlemanow.png index 0d20f80c05..0e76cb7e4c 100644 Binary files a/recipes/icons/czas_gentlemanow.png and b/recipes/icons/czas_gentlemanow.png differ diff --git a/recipes/icons/dwutygodnik.png b/recipes/icons/dwutygodnik.png new file mode 100644 index 0000000000..00356e091d Binary files /dev/null and b/recipes/icons/dwutygodnik.png differ diff --git a/recipes/icons/dziennik_baltycki.png b/recipes/icons/dziennik_baltycki.png new file mode 100644 index 0000000000..6572da8a34 Binary files /dev/null and b/recipes/icons/dziennik_baltycki.png differ diff --git a/recipes/icons/dziennik_lodzki.png b/recipes/icons/dziennik_lodzki.png new file mode 100644 index 0000000000..566fe3df86 Binary files /dev/null and b/recipes/icons/dziennik_lodzki.png differ diff --git a/recipes/icons/dziennik_pl.png b/recipes/icons/dziennik_pl.png index 
066f3d215b..3b9a6beae5 100644 Binary files a/recipes/icons/dziennik_pl.png and b/recipes/icons/dziennik_pl.png differ diff --git a/recipes/icons/dziennik_wschodni.png b/recipes/icons/dziennik_wschodni.png new file mode 100644 index 0000000000..10c0b4fb58 Binary files /dev/null and b/recipes/icons/dziennik_wschodni.png differ diff --git a/recipes/icons/dziennik_zachodni.png b/recipes/icons/dziennik_zachodni.png new file mode 100644 index 0000000000..1df89a85d3 Binary files /dev/null and b/recipes/icons/dziennik_zachodni.png differ diff --git a/recipes/icons/echo_dnia.png b/recipes/icons/echo_dnia.png new file mode 100644 index 0000000000..a7454b9964 Binary files /dev/null and b/recipes/icons/echo_dnia.png differ diff --git a/recipes/icons/eksiazki.png b/recipes/icons/eksiazki.png index 6f4f18d991..f17e877f09 100644 Binary files a/recipes/icons/eksiazki.png and b/recipes/icons/eksiazki.png differ diff --git a/recipes/icons/elguardian.png b/recipes/icons/elguardian.png new file mode 100644 index 0000000000..a54b067ee4 Binary files /dev/null and b/recipes/icons/elguardian.png differ diff --git a/recipes/icons/emuzica_pl.png b/recipes/icons/emuzica_pl.png new file mode 100644 index 0000000000..f708208bd2 Binary files /dev/null and b/recipes/icons/emuzica_pl.png differ diff --git a/recipes/icons/esenja.png b/recipes/icons/esenja.png new file mode 100644 index 0000000000..185e46ea95 Binary files /dev/null and b/recipes/icons/esenja.png differ diff --git a/recipes/icons/esensja_(rss).png b/recipes/icons/esensja_(rss).png new file mode 100644 index 0000000000..185e46ea95 Binary files /dev/null and b/recipes/icons/esensja_(rss).png differ diff --git a/recipes/icons/eso_pl.png b/recipes/icons/eso_pl.png new file mode 100644 index 0000000000..5f6e18e98e Binary files /dev/null and b/recipes/icons/eso_pl.png differ diff --git a/recipes/icons/film_org_pl.png b/recipes/icons/film_org_pl.png new file mode 100644 index 0000000000..c1c26087dc Binary files /dev/null and 
b/recipes/icons/film_org_pl.png differ diff --git a/recipes/icons/film_web.png b/recipes/icons/film_web.png index 3ddcdf1cde..260006784a 100644 Binary files a/recipes/icons/film_web.png and b/recipes/icons/film_web.png differ diff --git a/recipes/icons/financial_times_us.png b/recipes/icons/financial_times_us.png new file mode 100644 index 0000000000..2a769d9dbb Binary files /dev/null and b/recipes/icons/financial_times_us.png differ diff --git a/recipes/icons/gameplay_pl.png b/recipes/icons/gameplay_pl.png index 1b7081f393..4ee734aa65 100644 Binary files a/recipes/icons/gameplay_pl.png and b/recipes/icons/gameplay_pl.png differ diff --git a/recipes/icons/gazeta_krakowska.png b/recipes/icons/gazeta_krakowska.png new file mode 100644 index 0000000000..5249702362 Binary files /dev/null and b/recipes/icons/gazeta_krakowska.png differ diff --git a/recipes/icons/gazeta_lubuska.png b/recipes/icons/gazeta_lubuska.png new file mode 100644 index 0000000000..6f2c30992e Binary files /dev/null and b/recipes/icons/gazeta_lubuska.png differ diff --git a/recipes/icons/gazeta_wroclawska.png b/recipes/icons/gazeta_wroclawska.png new file mode 100644 index 0000000000..6003bd2823 Binary files /dev/null and b/recipes/icons/gazeta_wroclawska.png differ diff --git a/recipes/icons/gazeta_wspolczesna.png b/recipes/icons/gazeta_wspolczesna.png new file mode 100644 index 0000000000..c21b6fa414 Binary files /dev/null and b/recipes/icons/gazeta_wspolczesna.png differ diff --git a/recipes/icons/gcn.png b/recipes/icons/gcn.png new file mode 100644 index 0000000000..2874885904 Binary files /dev/null and b/recipes/icons/gcn.png differ diff --git a/recipes/icons/gildia_pl.png b/recipes/icons/gildia_pl.png new file mode 100644 index 0000000000..47fec1bbe8 Binary files /dev/null and b/recipes/icons/gildia_pl.png differ diff --git a/recipes/icons/glos_wielkopolski.png b/recipes/icons/glos_wielkopolski.png new file mode 100644 index 0000000000..fc9726b094 Binary files /dev/null and 
b/recipes/icons/glos_wielkopolski.png differ diff --git a/recipes/icons/hatalska.png b/recipes/icons/hatalska.png new file mode 100644 index 0000000000..f6d20feb34 Binary files /dev/null and b/recipes/icons/hatalska.png differ diff --git a/recipes/icons/historia_pl.png b/recipes/icons/historia_pl.png index f7774e3139..e035cadc38 100644 Binary files a/recipes/icons/historia_pl.png and b/recipes/icons/historia_pl.png differ diff --git a/recipes/icons/hnonline.png b/recipes/icons/hnonline.png new file mode 100644 index 0000000000..1e073839ad Binary files /dev/null and b/recipes/icons/hnonline.png differ diff --git a/recipes/icons/in4_pl.png b/recipes/icons/in4_pl.png index b3351629f0..c5595fda27 100644 Binary files a/recipes/icons/in4_pl.png and b/recipes/icons/in4_pl.png differ diff --git a/recipes/icons/kresy_pl.png b/recipes/icons/kresy_pl.png index db8ef4efec..47a9537428 100644 Binary files a/recipes/icons/kresy_pl.png and b/recipes/icons/kresy_pl.png differ diff --git a/recipes/icons/ksiazka_pl.png b/recipes/icons/ksiazka_pl.png new file mode 100644 index 0000000000..4beab227dc Binary files /dev/null and b/recipes/icons/ksiazka_pl.png differ diff --git a/recipes/icons/kurier_galicyjski.png b/recipes/icons/kurier_galicyjski.png new file mode 100644 index 0000000000..4d66a15122 Binary files /dev/null and b/recipes/icons/kurier_galicyjski.png differ diff --git a/recipes/icons/kurier_lubelski.png b/recipes/icons/kurier_lubelski.png new file mode 100644 index 0000000000..a7d1a69752 Binary files /dev/null and b/recipes/icons/kurier_lubelski.png differ diff --git a/recipes/icons/kurier_poranny.png b/recipes/icons/kurier_poranny.png new file mode 100644 index 0000000000..9c2742a1eb Binary files /dev/null and b/recipes/icons/kurier_poranny.png differ diff --git a/recipes/icons/kurier_szczecinski.png b/recipes/icons/kurier_szczecinski.png new file mode 100644 index 0000000000..06aadc5529 Binary files /dev/null and b/recipes/icons/kurier_szczecinski.png differ diff --git 
a/recipes/icons/lifehacker_pl.png b/recipes/icons/lifehacker_pl.png new file mode 100644 index 0000000000..2019e91691 Binary files /dev/null and b/recipes/icons/lifehacker_pl.png differ diff --git a/recipes/icons/lomza.png b/recipes/icons/lomza.png index d7ee0a82ef..9ee73fbd5f 100644 Binary files a/recipes/icons/lomza.png and b/recipes/icons/lomza.png differ diff --git a/recipes/icons/mlody_technik_pl.png b/recipes/icons/mlody_technik_pl.png index 9529ff0511..33af2c3200 100644 Binary files a/recipes/icons/mlody_technik_pl.png and b/recipes/icons/mlody_technik_pl.png differ diff --git a/recipes/icons/money_pl.png b/recipes/icons/money_pl.png new file mode 100644 index 0000000000..7a6958856e Binary files /dev/null and b/recipes/icons/money_pl.png differ diff --git a/recipes/icons/more_intelligent_life.png b/recipes/icons/more_intelligent_life.png new file mode 100644 index 0000000000..4fcf66e9a1 Binary files /dev/null and b/recipes/icons/more_intelligent_life.png differ diff --git a/recipes/icons/national_geographic_pl.png b/recipes/icons/national_geographic_pl.png index 1ccae70dd6..f44be209d9 100644 Binary files a/recipes/icons/national_geographic_pl.png and b/recipes/icons/national_geographic_pl.png differ diff --git a/recipes/icons/nauka_w_polsce.png b/recipes/icons/nauka_w_polsce.png new file mode 100644 index 0000000000..0d872ce682 Binary files /dev/null and b/recipes/icons/nauka_w_polsce.png differ diff --git a/recipes/icons/nezavisne_novine.png b/recipes/icons/nezavisne_novine.png new file mode 100644 index 0000000000..29da3de24f Binary files /dev/null and b/recipes/icons/nezavisne_novine.png differ diff --git a/recipes/icons/nowa_fantastyka.png b/recipes/icons/nowa_fantastyka.png new file mode 100644 index 0000000000..5c71a5c892 Binary files /dev/null and b/recipes/icons/nowa_fantastyka.png differ diff --git a/recipes/icons/nowy_obywatel.png b/recipes/icons/nowy_obywatel.png new file mode 100644 index 0000000000..f41e35365f Binary files /dev/null and 
b/recipes/icons/nowy_obywatel.png differ diff --git a/recipes/icons/nto.png b/recipes/icons/nto.png new file mode 100644 index 0000000000..0e94a8d54f Binary files /dev/null and b/recipes/icons/nto.png differ diff --git a/recipes/icons/oclab_pl.png b/recipes/icons/oclab_pl.png index 45ecd2533e..2a5bd473f9 100644 Binary files a/recipes/icons/oclab_pl.png and b/recipes/icons/oclab_pl.png differ diff --git a/recipes/icons/osworld_pl.png b/recipes/icons/osworld_pl.png new file mode 100644 index 0000000000..97a7d0dd55 Binary files /dev/null and b/recipes/icons/osworld_pl.png differ diff --git a/recipes/icons/pc_centre_pl.png b/recipes/icons/pc_centre_pl.png index e2fbf1eefb..d7aa6f7776 100644 Binary files a/recipes/icons/pc_centre_pl.png and b/recipes/icons/pc_centre_pl.png differ diff --git a/recipes/icons/pravda_rs.png b/recipes/icons/pravda_rs.png new file mode 100644 index 0000000000..8c4533a79d Binary files /dev/null and b/recipes/icons/pravda_rs.png differ diff --git a/recipes/icons/spiders_web_pl.png b/recipes/icons/spiders_web_pl.png index 499dd19c8f..11b5ee8b08 100644 Binary files a/recipes/icons/spiders_web_pl.png and b/recipes/icons/spiders_web_pl.png differ diff --git a/recipes/icons/tablety_pl.png b/recipes/icons/tablety_pl.png new file mode 100644 index 0000000000..0c3a25813a Binary files /dev/null and b/recipes/icons/tablety_pl.png differ diff --git a/recipes/icons/trojmiasto_pl.png b/recipes/icons/trojmiasto_pl.png new file mode 100644 index 0000000000..665a62b0b5 Binary files /dev/null and b/recipes/icons/trojmiasto_pl.png differ diff --git a/recipes/icons/tvn24.png b/recipes/icons/tvn24.png index 864a6624ac..152b15af87 100644 Binary files a/recipes/icons/tvn24.png and b/recipes/icons/tvn24.png differ diff --git a/recipes/icons/ubuntu_pomoc_org.png b/recipes/icons/ubuntu_pomoc_org.png new file mode 100644 index 0000000000..a143846630 Binary files /dev/null and b/recipes/icons/ubuntu_pomoc_org.png differ diff --git a/recipes/icons/websecurity_pl.png 
b/recipes/icons/websecurity_pl.png new file mode 100644 index 0000000000..32eff82072 Binary files /dev/null and b/recipes/icons/websecurity_pl.png differ diff --git a/recipes/icons/wprost.png b/recipes/icons/wprost.png index f81878f2d2..d844978175 100644 Binary files a/recipes/icons/wprost.png and b/recipes/icons/wprost.png differ diff --git a/recipes/icons/wprost_rss.png b/recipes/icons/wprost_rss.png new file mode 100644 index 0000000000..d844978175 Binary files /dev/null and b/recipes/icons/wprost_rss.png differ diff --git a/recipes/icons/zycie_warszawy.png b/recipes/icons/zycie_warszawy.png new file mode 100644 index 0000000000..a13d1a018c Binary files /dev/null and b/recipes/icons/zycie_warszawy.png differ diff --git a/recipes/il_giornale.recipe b/recipes/il_giornale.recipe index 007432ed88..6d3eaa5fef 100644 --- a/recipes/il_giornale.recipe +++ b/recipes/il_giornale.recipe @@ -7,7 +7,6 @@ description = 'Italian daily newspaper - 09-11-2011' ''' http://www.ilgiornale.it/ ''' -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe class IlGiornale(BasicNewsRecipe): @@ -25,35 +24,39 @@ class IlGiornale(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 use_embedded_content = False + #auto_cleanup = True + #auto_cleanup_keep = '//div[@id="insertbox_text"]' no_stylesheets = True conversion_options = {'linearize_tables':True} remove_javascript = True + keep_only_tags = [dict(name='h1', attrs={'class':'entry-title'}), dict(name='div', attrs={'id':'insertbox_text'})] - def get_article_url(self, article): - return article.get('guid', article.get('id', None)) - def print_version(self, url): - raw = self.browser.open(url).read() - soup = BeautifulSoup(raw.decode('utf8', 'replace')) - all_print_tags = soup.find('div', {'id':'print_article'}) - print_link = all_print_tags.a - if print_link is None: - return url - return 'http://www.ilgiornale.it' + print_link['href'] + #def get_article_url(self, article): + 
#return article.get('guid', article.get('id', None)) + + #def print_version(self, url): + #raw = self.browser.open(url).read() + #soup = BeautifulSoup(raw.decode('utf8', 'replace')) + #all_print_tags = soup.find('div', {'id':'print_article'}) + #print_link = all_print_tags.a + #if print_link is None: + #return url + #return 'http://www.ilgiornale.it' + print_link['href'] feeds = [ - (u'Ultime Notizie',u'http://www.ilgiornale.it/?RSS=S'), - (u'All\'Interno', u'http://www.ilgiornale.it/la_s.pic1?SID=8&RSS=S'), - (u'Esteri', u'http://www.ilgiornale.it/la_s.pic1?SID=6&RSS=S'), - (u'Economia', u'http://www.ilgiornale.it/la_s.pic1?SID=5&RSS=S'), - (u'Cultura', u'http://www.ilgiornale.it/la_s.pic1?SID=4&RSS=S'), - (u'Spettacoli', u'http://www.ilgiornale.it/la_s.pic1?SID=14&RSS=S'), - (u'Sport', u'http://www.ilgiornale.it/la_s.pic1?SID=15&RSS=S'), - (u'Tech&Web', u'http://www.ilgiornale.it/la_s.pic1?SID=35&RSS=S'), - (u'Edizione di Roma', u'http://www.ilgiornale.it/roma.pic1?SID=13&RSS=S'), - (u'Edizione di Milano', u'http://www.ilgiornale.it/milano.pic1?SID=9&RSS=S'), - (u'Edizione di Genova', u'http://www.ilgiornale.it/genova.pic1?SID=7&RSS=S') + (u'Ultime Notizie',u'http://www.ilgiornale.it/rss.xml'), + #(u'All\'Interno', u'http://www.ilgiornale.it/la_s.pic1?SID=8&RSS=S'), + #(u'Esteri', u'http://www.ilgiornale.it/la_s.pic1?SID=6&RSS=S'), + #(u'Economia', u'http://www.ilgiornale.it/la_s.pic1?SID=5&RSS=S'), + #(u'Cultura', u'http://www.ilgiornale.it/la_s.pic1?SID=4&RSS=S'), + #(u'Spettacoli', u'http://www.ilgiornale.it/la_s.pic1?SID=14&RSS=S'), + #(u'Sport', u'http://www.ilgiornale.it/la_s.pic1?SID=15&RSS=S'), + #(u'Tech&Web', u'http://www.ilgiornale.it/la_s.pic1?SID=35&RSS=S'), + #(u'Edizione di Roma', u'http://www.ilgiornale.it/roma.pic1?SID=13&RSS=S'), + #(u'Edizione di Milano', u'http://www.ilgiornale.it/milano.pic1?SID=9&RSS=S'), + #(u'Edizione di Genova', u'http://www.ilgiornale.it/genova.pic1?SID=7&RSS=S') ] diff --git a/recipes/in4_pl.recipe 
b/recipes/in4_pl.recipe index e385522714..f115014b5d 100644 --- a/recipes/in4_pl.recipe +++ b/recipes/in4_pl.recipe @@ -1,5 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import re +from calibre.ebooks.BeautifulSoup import Comment + class in4(BasicNewsRecipe): title = u'IN4.pl' oldest_article = 7 @@ -8,14 +10,14 @@ class in4(BasicNewsRecipe): description = u'Serwis Informacyjny - Aktualnosci, recenzje' category = 'IT' language = 'pl' - index='http://www.in4.pl/' + index = 'http://www.in4.pl/' #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' no_stylesheets = True remove_empty_feeds = True preprocess_regexps = [(re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '') ] - keep_only_tags=[dict(name='div', attrs={'class':'left_alone'})] - remove_tags_after=dict(name='img', attrs={'title':'komentarze'}) - remove_tags=[dict(name='img', attrs={'title':'komentarze'})] + keep_only_tags = [dict(name='div', attrs={'class':'left_alone'})] + remove_tags_after = dict(name='img', attrs={'title':'komentarze'}) + remove_tags = [dict(name='img', attrs={'title':'komentarze'})] feeds = [(u'Wiadomo\u015bci', u'http://www.in4.pl/rss.php'), (u'Recenzje', u'http://www.in4.pl/rss_recenzje.php'), (u'Mini recenzje', u'http://www.in4.pl/rss_mini.php')] def append_page(self, soup, appendtag): @@ -28,10 +30,13 @@ class in4(BasicNewsRecipe): while nexturl: soup2 = self.index_to_soup(nexturl) pagetext = soup2.find(id='news') + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - nexturl=None - tag=soup2.findAll('a') + nexturl = None + tag = soup2.findAll('a') for z in tag: if z.string and u'następna str' in z.string: nexturl='http://www.in4.pl/' + z['href'] diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe index ac31134103..692dcdc07e 100644 --- a/recipes/informacje_usa.recipe +++ 
b/recipes/informacje_usa.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -import re class Informacje_USA(BasicNewsRecipe): title = u'Informacje USA' oldest_article = 7 @@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe): description = u'portal wiadomości amerykańskich' category = 'news' language = 'pl' - masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' - cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' + cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg' no_stylesheets = True - preprocess_regexps = [(re.compile(ur'<p>Zobacz:.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><a href=".*?Zobacz także:.*?</a></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><p>Zobacz też:.*?</a></p>', re.DOTALL), lambda match: '')] - keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})] - remove_tags_after= dict(attrs={'class':'tags'}) - remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})] + use_embedded_content = False + keep_only_tags=[dict(id='post-area')] + remove_tags_after= dict(id='content-area') + remove_tags= [dict(attrs={'class':['breadcrumb']}), dict(id=['social-box', 'social-box-vert'])] feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')] diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe index e021fa0c17..8c1915db15 100644 --- a/recipes/infra_pl.recipe +++ b/recipes/infra_pl.recipe @@ -15,7 +15,7 @@ class INFRA(BasicNewsRecipe): remove_tags_before=dict(name='h2', attrs={'class':'contentheading'}) remove_tags_after=dict(attrs={'class':'pagenav'}) remove_tags=[dict(attrs={'class':'pagenav'})] - feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')] + feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/rss')] def preprocess_html(self, soup): for item 
in soup.findAll(style=True): @@ -23,4 +23,4 @@ class INFRA(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/interia_fakty.recipe b/recipes/interia_fakty.recipe index 63f95f3382..74cf56b267 100644 --- a/recipes/interia_fakty.recipe +++ b/recipes/interia_fakty.recipe @@ -20,10 +20,10 @@ class InteriaFakty(BasicNewsRecipe): max_articles_per_feed = 100 feeds = [(u'Kraj', u'http://kanaly.rss.interia.pl/kraj.xml'), - (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), - (u'Wiadomo\u015bci dnia', u'http://kanaly.rss.interia.pl/fakty.xml'), - (u'Przegl\u0105d prasy', u'http://kanaly.rss.interia.pl/przeglad_prasy.xml'), - (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), + (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), + (u'Wiadomo\u015bci dnia', u'http://kanaly.rss.interia.pl/fakty.xml'), + (u'Przegl\u0105d prasy', u'http://kanaly.rss.interia.pl/przeglad_prasy.xml'), + (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), (u'Ciekawostki', u'http://kanaly.rss.interia.pl/ciekawostki.xml')] keep_only_tags = [dict(name='div', attrs={'id':'article'})] @@ -35,5 +35,5 @@ class InteriaFakty(BasicNewsRecipe): dict(name='span', attrs={'class':'keywords'})] extra_css = ''' - h2 { font-size: 1.2em; } - ''' + h2 { font-size: 1.2em; } + ''' diff --git a/recipes/interia_sport.recipe b/recipes/interia_sport.recipe index 995dd114a8..dd46b0a4bc 100644 --- a/recipes/interia_sport.recipe +++ b/recipes/interia_sport.recipe @@ -20,13 +20,13 @@ class InteriaSport(BasicNewsRecipe): remove_javascript = True max_articles_per_feed = 100 - feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'), - (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'), - (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'), - (u'Koszyk\xf3wka', 
u'http://kanaly.rss.interia.pl/koszykowka.xml'), - (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'), - (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'), - (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'), + feeds = [(u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'), + (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'), + (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'), + (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'), + (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'), + (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'), + (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'), (u'Tenis', u'http://kanaly.rss.interia.pl/tenis.xml')] keep_only_tags = [dict(name='div', attrs={'id':'article'})] @@ -63,7 +63,7 @@ class InteriaSport(BasicNewsRecipe): ''' preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'<p><a href.*?</a></p>', lambda match: ''), # FIXME diff --git a/recipes/jazzpress.recipe b/recipes/jazzpress.recipe new file mode 100644 index 0000000000..e11ab70a96 --- /dev/null +++ b/recipes/jazzpress.recipe @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = u'Łukasz Grąbczewski 2011-2013' +__version__ = '2.0' + +import re, os +from calibre import walk +from calibre.utils.zipfile import ZipFile +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe + +class jazzpress(BasicNewsRecipe): + __author__ = u'Łukasz Grąbczewski' + title = 'JazzPRESS' + language = 'pl' + publisher = 'Fundacja Popularyzacji Muzyki Jazzowej EuroJAZZ' + publication_type = 'magazine' + description = u'Internetowa gazeta poświęcona muzyce improwizowanej' + + conversion_options = { + 'authors' : 'Fundacja Popularyzacji Muzyki Jazzowej EuroJAZZ' + ,'publisher' : 
publisher + ,'language' : language + ,'preserve_cover_aspect_ratio': True + ,'remove_first_image': True + } + + def build_index(self): + browser = self.get_browser() + browser.open('http://radiojazz.fm/') + + # find the link + epublink = browser.find_link(url_regex=re.compile('e_jazzpress\d\d\d\d\_epub')) + + # download ebook + self.report_progress(0,_('Downloading ePUB')) + response = browser.follow_link(epublink) + book_file = PersistentTemporaryFile(suffix='.epub') + book_file.write(response.read()) + book_file.close() + + # convert + self.report_progress(0.2,_('Converting to OEB')) + oeb = self.output_dir + '/INPUT/' + if not os.path.exists(oeb): + os.makedirs(oeb) + with ZipFile(book_file.name) as f: + f.extractall(path=oeb) + + for f in walk(oeb): + if f.endswith('.opf'): + return f # convert + diff --git a/recipes/kath_net.recipe b/recipes/kath_net.recipe index 3b883fde54..9a21b18a7e 100644 --- a/recipes/kath_net.recipe +++ b/recipes/kath_net.recipe @@ -7,12 +7,14 @@ class AdvancedUserRecipe1295262156(BasicNewsRecipe): oldest_article = 7 language = 'de' max_articles_per_feed = 100 + no_stylesheets = True + auto_cleanup = True + encoding='iso-8859-1' feeds = [(u'kath.net', u'http://www.kath.net/2005/xml/index.xml')] - def print_version(self, url): - return url+"&print=yes" + return url+"/print/yes" extra_css = 'td.textb {font-size: medium;}' diff --git a/recipes/kdefamily_pl.recipe b/recipes/kdefamily_pl.recipe new file mode 100644 index 0000000000..75f88b0f3d --- /dev/null +++ b/recipes/kdefamily_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class KDEFamilyPl(BasicNewsRecipe): + title = u'KDEFamily.pl' + __author__ = 'fenuks' + description = u'KDE w Polsce' + category = 'open source, KDE' + language = 'pl' + cover_url = 'http://www.mykde.home.pl/kdefamily/wp-content/uploads/2012/07/logotype-e1341585198616.jpg' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = True + feeds = 
[(u'Wszystko', u'http://kdefamily.pl/feed/')] diff --git a/recipes/kellog_insight.recipe b/recipes/kellog_insight.recipe index db5c7265b9..ddd0ae9adb 100644 --- a/recipes/kellog_insight.recipe +++ b/recipes/kellog_insight.recipe @@ -17,12 +17,14 @@ class KellogInsight(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' language = 'en' + auto_cleanup = True + use_embedded_content = False oldest_article = 60 - keep_only_tags = [dict(name='div', attrs={'id':['print_no_comments']})] + #keep_only_tags = [dict(name='div', attrs={'id':['print_no_comments']})] - remove_tags = [dict(name='div', attrs={'class':'col-three'})] + #remove_tags = [dict(name='div', attrs={'class':'col-three'})] extra_css = ''' h1{font-family:arial; font-size:medium; color:#333333;} diff --git a/recipes/kommersant.recipe b/recipes/kommersant.recipe index 09fb8f8ad8..390ae7d1bd 100644 --- a/recipes/kommersant.recipe +++ b/recipes/kommersant.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>' +__copyright__ = '2010-2013, Darko Miletic <darko.miletic at gmail.com>' ''' www.kommersant.ru ''' @@ -29,17 +29,20 @@ class Kommersant_ru(BasicNewsRecipe): """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language } keep_only_tags = [dict(attrs={'class':['document','document_vvodka','document_text','document_authors vblock']})] remove_tags = [dict(name=['iframe','object','link','img','base','meta'])] - feeds = [(u'Articles', u'http://feeds.kommersant.ru/RSS_Export/RU/daily.xml')] - + feeds = [(u'Articles', u'http://dynamic.feedsportal.com/pf/438800/http://feeds.kommersant.ru/RSS_Export/RU/daily.xml')] + + def get_article_url(self, article): + return article.get('guid', None) + def print_version(self, url): return url.replace('/doc-rss/','/Doc/') + '/Print' \ No newline 
at end of file diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe index 8add89db94..b29e7e243b 100644 --- a/recipes/konflikty_zbrojne.recipe +++ b/recipes/konflikty_zbrojne.recipe @@ -7,7 +7,7 @@ class Konflikty(BasicNewsRecipe): __author__ = 'fenuks' cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg' language = 'pl' - description ='military news' + description = u'Zbiór ciekawych artykułów historycznych, militarnych oraz recenzji książek, gier i filmów. Najświeższe informacje o lotnictwie, wojskach lądowych i polityce.' category='military, history' oldest_article = 7 max_articles_per_feed = 100 diff --git a/recipes/kopalniawiedzy.recipe b/recipes/kopalniawiedzy.recipe index a7b932f618..13703f02ef 100644 --- a/recipes/kopalniawiedzy.recipe +++ b/recipes/kopalniawiedzy.recipe @@ -6,74 +6,75 @@ import re from calibre.web.feeds.recipes import BasicNewsRecipe class KopalniaWiedzy(BasicNewsRecipe): - title = u'Kopalnia Wiedzy' - publisher = u'Kopalnia Wiedzy' - description = u'Ciekawostki ze świata nauki i techniki' - encoding = 'utf-8' - __author__ = 'Attis & Tomasz Długosz' - language = 'pl' - oldest_article = 7 - max_articles_per_feed = 100 - INDEX = u'http://kopalniawiedzy.pl/' - remove_javascript = True - no_stylesheets = True + title = u'Kopalnia Wiedzy' + publisher = u'Kopalnia Wiedzy' + description = u'Ciekawostki ze świata nauki i techniki' + encoding = 'utf-8' + __author__ = 'Attis & Tomasz Długosz' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + INDEX = u'http://kopalniawiedzy.pl/' + remove_javascript = True + remove_empty_feeds = True + no_stylesheets = True - remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}] - remove_tags_after = dict(attrs={'class':'ad-square'}) - keep_only_tags = [dict(name="div", 
attrs={'class':'article-text text-small'})] - extra_css = '.topimage {margin-top: 30px}' + remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}] + remove_tags_after = dict(attrs={'class':'ad-square'}) + keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})] + extra_css = '.topimage {margin-top: 30px}' - preprocess_regexps = [ - (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'), - lambda match: '<img class="topimage" ' + match.group(1) + '>' ), - (re.compile(u'<br /><br />'), - lambda match: '<br\/>') - ] - - feeds = [ - (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'), - (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'), - (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'), - (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'), - (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'), - (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss') + preprocess_regexps = [ + (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'), + lambda match: '<img class="topimage" ' + match.group(1) + '>' ), + (re.compile(u'<br /><br />'), + lambda match: '<br\/>') ] - def is_link_wanted(self, url, tag): - return tag['class'] == 'next' + feeds = [ + (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'), + (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'), + (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'), + (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'), + (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'), + (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss') + ] - def remove_beyond(self, tag, next): - while tag is not None and getattr(tag, 'name', None) != 'body': - 
after = getattr(tag, next) - while after is not None: - ns = getattr(tag, next) - after.extract() - after = ns - tag = tag.parent + def is_link_wanted(self, url, tag): + return tag['class'] == 'next' - def append_page(self, soup, appendtag, position): - pager = soup.find('a',attrs={'class':'next'}) - if pager: - nexturl = self.INDEX + pager['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'id':'articleContent'}) + def remove_beyond(self, tag, next): + while tag is not None and getattr(tag, 'name', None) != 'body': + after = getattr(tag, next) + while after is not None: + ns = getattr(tag, next) + after.extract() + after = ns + tag = tag.parent - tag = texttag.find(attrs={'class':'pages'}) - self.remove_beyond(tag, 'nextSibling') + def append_page(self, soup, appendtag, position): + pager = soup.find('a',attrs={'class':'next'}) + if pager: + nexturl = self.INDEX + pager['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'id':'articleContent'}) - newpos = len(texttag.contents) - self.append_page(soup2,texttag,newpos) + tag = texttag.find(attrs={'class':'pages'}) + self.remove_beyond(tag, 'nextSibling') - appendtag.insert(position,texttag) + newpos = len(texttag.contents) + self.append_page(soup2,texttag,newpos) + + appendtag.insert(position,texttag) - def preprocess_html(self, soup): - self.append_page(soup, soup.body, 3) + def preprocess_html(self, soup): + self.append_page(soup, soup.body, 3) - for item in soup.findAll('div',attrs={'class':'pages'}): - item.extract() + for item in soup.findAll('div',attrs={'class':'pages'}): + item.extract() - for item in soup.findAll('p', attrs={'class':'wykop'}): - item.extract() + for item in soup.findAll('p', attrs={'class':'wykop'}): + item.extract() - return soup + return soup diff --git a/recipes/korespondent.recipe b/recipes/korespondent.recipe index aa9cf6e828..fff0946593 100644 --- a/recipes/korespondent.recipe +++ b/recipes/korespondent.recipe @@ -24,17 
+24,16 @@ class KorespondentPL(BasicNewsRecipe): extra_css = '.naglowek {font-size: small}\n .tytul {font-size: x-large; padding-bottom: 10px; padding-top: 30px} \n .external {font-size: small}' preprocess_regexps = [ - (re.compile(u'<a href="index\.php.*>(.*)</a>'), - lambda match: match.group(1) ), - (re.compile(u'<i>'), - lambda match:'<i class="external">' ), - (re.compile(u'<p></p>Więcej'), - lambda match:'Więcej' ), - (re.compile(u'target="_blank"'), - lambda match:'target="_blank" class="external"' ), - (re.compile(u'<p align="center">\nPoczytaj inne teksty w <a href="http://www.korespondent.pl">Serwisie wolnorynkowym Korespondent.pl</a>.*</body>', re.DOTALL|re.IGNORECASE), - lambda match: '</div></body>'), - ] + (re.compile(u'<a href="index\.php.*>(.*)</a>'), + lambda match: match.group(1) ), + (re.compile(u'<i>'), + lambda match:'<i class="external">' ), + (re.compile(u'<p></p>Więcej'), + lambda match:'Więcej' ), + (re.compile(u'target="_blank"'), + lambda match:'target="_blank" class="external"' ), + (re.compile(u'<p align="center">\nPoczytaj inne teksty w <a href="http://www.korespondent.pl">Serwisie wolnorynkowym Korespondent.pl</a>.*</body>', re.DOTALL|re.IGNORECASE), + lambda match: '</div></body>'), + ] feeds = [(u'Serwis informacyjny', u'http://korespondent.pl/rss.xml')] - diff --git a/recipes/kosmonauta_pl.recipe b/recipes/kosmonauta_pl.recipe index d1caa85950..98628d667a 100644 --- a/recipes/kosmonauta_pl.recipe +++ b/recipes/kosmonauta_pl.recipe @@ -7,11 +7,13 @@ class Kosmonauta(BasicNewsRecipe): description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.' 
category = 'astronomy' language = 'pl' - cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' + cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' no_stylesheets = True INDEX = 'http://www.kosmonauta.net' oldest_article = 7 no_stylesheets = True + remove_javascript = True + remove_attributes = ['style'] max_articles_per_feed = 100 keep_only_tags = [dict(name='div', attrs={'class':'item-page'})] remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})] @@ -24,6 +26,5 @@ class Kosmonauta(BasicNewsRecipe): href = a['href'] if not href.startswith('http'): a['href'] = self.INDEX + href - print '%%%%%%%%%%%%%%%%%%%%%%%%%', a['href'] return soup - \ No newline at end of file + diff --git a/recipes/kp.recipe b/recipes/kp.recipe index f52fcef60b..85bf356b4d 100644 --- a/recipes/kp.recipe +++ b/recipes/kp.recipe @@ -47,6 +47,6 @@ class KrytykaPolitycznaRecipe(BasicNewsRecipe): def preprocess_html(self, soup): for alink in soup.findAll('a'): if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) + tstr = alink.string + alink.replaceWith(tstr) return soup diff --git a/recipes/kurier_galicyjski.recipe b/recipes/kurier_galicyjski.recipe new file mode 100644 index 0000000000..063bef8972 --- /dev/null +++ b/recipes/kurier_galicyjski.recipe @@ -0,0 +1,60 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs, Comment + +class KurierGalicyjski(BasicNewsRecipe): + title = u'Kurier Galicyjski' + __author__ = 'fenuks' + description = u'Kurier Galicyjski - największa gazeta dla Polaków na Ukrainie. Bieżące wydarzenia z życia polskiej mniejszości, historia, kultura, polityka, reportaże.' 
+ category = 'news' + language = 'pl' + cover_url = 'http://www.duszki.pl/Kurier_galicyjski_bis2_small.gif' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + keep_only_tags = [dict(attrs={'class':'item-page'})] + remove_tags = [dict(attrs={'class':'pagenav'}), dict(attrs={'style':'border-top-width: thin; border-top-style: dashed; border-top-color: #CCC; border-bottom-width: thin; border-bottom-style: dashed; border-bottom-color: #CCC; padding-top:5px; padding-bottom:5px; text-align:right; margin-top:10px; height:20px;'})] + feeds = [(u'Wydarzenia', u'http://kuriergalicyjski.com/index.php/wydarzenia?format=feed&type=atom'), (u'Publicystyka', u'http://kuriergalicyjski.com/index.php/niezwykle-historie?format=feed&type=atom'), (u'Reporta\u017ce', u'http://kuriergalicyjski.com/index.php/report?format=feed&type=atom'), (u'Rozmowy Kuriera', u'http://kuriergalicyjski.com/index.php/kuriera?format=feed&type=atom'), (u'Przegl\u0105d prasy', u'http://kuriergalicyjski.com/index.php/2012-01-05-14-08-55?format=feed&type=atom'), (u'Kultura', u'http://kuriergalicyjski.com/index.php/2011-12-02-14-26-39?format=feed&type=atom'), (u'Zabytki', u'http://kuriergalicyjski.com/index.php/2011-12-02-14-27-32?format=feed&type=atom'), (u'Polska-Ukraina', u'http://kuriergalicyjski.com/index.php/pol-ua?format=feed&type=atom'), (u'Polacy i Ukrai\u0144cy', u'http://kuriergalicyjski.com/index.php/polacy-i-ukr?format=feed&type=atom'), (u'Niezwyk\u0142e historie', u'http://kuriergalicyjski.com/index.php/niezwykle-historie?format=feed&type=atom'), (u'Polemiki', u'http://kuriergalicyjski.com/index.php/polemiki?format=feed&type=atom')] + + def append_page(self, soup, appendtag): + pager = soup.find(id='article-index') + if pager: + pager = pager.findAll('a')[1:] + if pager: + for a in pager: + nexturl = 'http://www.kuriergalicyjski.com' + a['href'] + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(attrs={'class':'item-page'}) + if 
pagetext.h2: + pagetext.h2.extract() + r = pagetext.find(attrs={'class':'article-info'}) + if r: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + for r in appendtag.findAll(id='article-index'): + r.extract() + for r in appendtag.findAll(attrs={'class':'pagenavcounter'}): + r.extract() + for r in appendtag.findAll(attrs={'class':'pagination'}): + r.extract() + for r in appendtag.findAll(attrs={'class':'pagenav'}): + r.extract() + for r in appendtag.findAll(attrs={'style':'border-top-width: thin; border-top-style: dashed; border-top-color: #CCC; border-bottom-width: thin; border-bottom-style: dashed; border-bottom-color: #CCC; padding-top:5px; padding-bottom:5px; text-align:right; margin-top:10px; height:20px;'}): + r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + for r in soup.findAll(style=True): + del r['style'] + for img in soup.findAll(attrs={'class':'easy_img_caption smartresize'}): + img.insert(len(img.contents)-1, bs('<br />')) + img.insert(len(img.contents), bs('<br /><br />')) + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://kuriergalicyjski.com' + a['href'] + return soup diff --git a/recipes/kurier_lubelski.recipe b/recipes/kurier_lubelski.recipe new file mode 100644 index 0000000000..4ae40b0eb5 --- /dev/null +++ b/recipes/kurier_lubelski.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class KurierLubelski(BasicNewsRecipe): + title = u'Kurier Lubelski' + __author__ = 'fenuks' + description = u'Gazeta Regionalna Kurier Lubelski. Najnowsze Wiadomości Lublin. Czytaj Informacje Lublin!' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/kurierlubelski.png?24' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] + remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + + feeds = [(u'Wiadomo\u015bci', u'http://kurierlubelski.feedsportal.com/c/32980/f/533785/index.rss?201302'), (u'Region', u'http://www.kurierlubelski.pl/rss/kurierlubelski_region.xml?201302'), (u'Sport', u'http://kurierlubelski.feedsportal.com/c/32980/f/533786/index.rss?201302'), (u'Kultura', u'http://kurierlubelski.feedsportal.com/c/32980/f/533787/index.rss?201302'), (u'Rozmaito\u015bci', u'http://www.kurierlubelski.pl/rss/kurierlubelski_rozmaitosci.xml?201302'), (u'Dom', u'http://www.kurierlubelski.pl/rss/kurierlubelski_dom.xml?201302'), (u'Serwisy', u'http://www.kurierlubelski.pl/rss/kurierlubelski_serwisy.xml?201302'), (u'Motofakty', u'http://www.kurierlubelski.pl/rss/kurierlubelski_motofakty.xml?201302'), (u'M\xf3j Reporter', 
u'http://www.kurierlubelski.pl/rss/kurierlubelski_mojreporter.xml?201302'), (u'Praca', u'http://www.kurierlubelski.pl/rss/kurierlubelski_praca.xml?201302')] + + def print_version(self, url): + return url.replace('artykul', 'drukuj') + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/kurier-lubelski/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/kurier_poranny.recipe b/recipes/kurier_poranny.recipe new file mode 100644 index 0000000000..5c2f2d8e1e --- /dev/null +++ b/recipes/kurier_poranny.recipe @@ -0,0 +1,84 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class KurierPoranny(BasicNewsRecipe): + title = u'Kurier Poranny' + __author__ = 'fenuks' + description = u'Kurier Poranny | poranny.pl - portal miejski Białegostoku,informacje,wydarzenia' + category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.poranny.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + + feeds = [(u'Wszystkie', u'http://www.poranny.pl/rss.xml'), + (u'Białystok', u'http://www.poranny.pl/bialystok.xml'), + (u'Bielsk Podlaski', u'http://www.poranny.pl/bielskpodlaski.xml'), + (u'Czarna Białostocka', u'http://www.poranny.pl/czarnabialostocka.xml'), + (u'Hajnówka', u'http://www.poranny.pl/hajnowka.xml'), + (u'Łapy', u'http://www.poranny.pl/lapy.xml'), + (u'Sokółka', u'http://www.poranny.pl/sokolka.xml'), + (u'Supraśl', u'http://www.poranny.pl/suprasl.xml'), + (u'Wasilków', u'http://www.poranny.pl/wasilkow.xml'), + (u'Sport', u'http://www.poranny.pl/sport.xml'), + (u'Praca', u'http://www.poranny.pl/praca.xml'), + (u'Kultura', u'http://www.poranny.pl/kultura.xml'), + (u'Dom', u'http://www.poranny.pl/dom.xml'), + (u'Auto', u'http://www.poranny.pl/auto.xml'), + (u'Polityka', u'http://www.poranny.pl/polityka.xml')] + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = 
soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/kurier_szczecinski.recipe b/recipes/kurier_szczecinski.recipe new file mode 100644 index 0000000000..a4f9b115e8 --- /dev/null +++ b/recipes/kurier_szczecinski.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class KurierSzczecinski(BasicNewsRecipe): + title = u'Kurier Szczeci\u0144ski' + __author__ = 'fenuks' + description = u'24Kurier jest portalem Kuriera Szczecińskiego. Zawiera aktualności ze Szczecina oraz wiadomości regionalne z województwa zachodniopomorskiego.' + category = 'newspaper' + #publication_type = '' + language = 'pl' + #encoding = '' + #extra_css = '' + cover_url = 'http://www.24kurier.pl/Administracja/Img/24kurier_logo-copy-po-zapis' + #masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'section'})] + remove_tags = [dict(attrs={'class':['Ikonki', 'rek', 'artComments']})] + remove_tags_after = dict(attrs={'class':'artComments'}) + #remove_tags_before = dict() + feeds = [(u'Aktualno\u015bci', u'http://www.24kurier.pl/cmspages/articles_rss.aspx'), (u'Kraj', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kraj'), (u'\u015awiat', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swiat'), (u'Sport', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=sport'), 
(u'Kultura', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kultura'), (u'Gospodarka', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gospodarka'), (u'Nauka', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=nauka'), (u'Region', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=region'), (u'Szczecin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=szczecin'), (u'Bia\u0142ogard', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=bialogard'), (u'Choszczno', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=choszczno'), (u'Drawsko', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=drawsko'), (u'Goleni\xf3w', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=goleniow'), (u'Gryfice', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gryfice'), (u'Gryfino', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=gryfino'), (u'Kamie\u0144 Pomorski', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kamien'), (u'Ko\u0142obrzeg', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=kolobrzeg'), (u'Koszalin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=koszalin'), (u'\u0141obez', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=lobez'), (u'My\u015blib\xf3rz', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=mysliborz'), (u'Police', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=police'), (u'Pyrzyce', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=pyrzyce'), (u'S\u0142awno', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=slawno'), (u'Stargard', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=stargard'), (u'Szczecinek', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=szczecinek'), (u'\u015awidwin', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swidwin'), (u'\u015awinouj\u015bcie', u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=swinoujscie'), (u'Wa\u0142cz', 
u'http://www.24kurier.pl/cmspages/articles_rss.aspx?dzial=walcz')] diff --git a/recipes/kyungyhang b/recipes/kyungyhang.recipe similarity index 91% rename from recipes/kyungyhang rename to recipes/kyungyhang.recipe index ac658b1cab..7fe5b88612 100644 --- a/recipes/kyungyhang +++ b/recipes/kyungyhang.recipe @@ -16,14 +16,14 @@ class Kyungyhang(BasicNewsRecipe): max_articles_per_feed = 20 no_stylesheets = True remove_javascript = True - + keep_only_tags = [ dict(name='div', attrs ={'class':['article_title_wrap']}), - dict(name='div', attrs ={'class':['article_txt']}) + dict(name='span', attrs ={'class':['article_txt']}) ] remove_tags_after = dict(id={'sub_bottom'}) - + remove_tags = [ dict(name='iframe'), dict(id={'TdHot'}), @@ -31,7 +31,7 @@ class Kyungyhang(BasicNewsRecipe): dict(name='dl', attrs={'class':['CL']}), dict(name='ul', attrs={'class':['tab']}), ] - + feeds = [ ('All News','http://www.khan.co.kr/rss/rssdata/total_news.xml'), - ] \ No newline at end of file + ] diff --git a/recipes/lamebook.recipe b/recipes/lamebook.recipe new file mode 100644 index 0000000000..e449285d84 --- /dev/null +++ b/recipes/lamebook.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class LamebookRecipe(BasicNewsRecipe): + title = 'Lamebook' + __author__ = 'atordo' + description = 'Funny Facebook Statuses, Fails, LOLs and More - The Original' + cover_url = 'http://www.lamebook.com/wp-content/themes/lamebook/images/h1-new2.png' + oldest_article = 7 + max_articles_per_feed = 50 + auto_cleanup = False + no_stylesheets = True + remove_javascript = True + language = 'en' + use_embedded_content = False + publication_type = 'blog' + + keep_only_tags = [ + dict(name='div', attrs={'class':'entry'}) + ,dict(name='ol', attrs={'class':'commentlist'}) + ] + + remove_tags = [ + dict(name='div', attrs={'style':['clear: left; float: left; margin: 0 15px 0 0;' + ,'float: left; margin: 0 15px 0 0;']}) + ] + + feeds = [('Lamebook', 
'http://feeds.feedburner.com/Lamebook')] diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 56156166dc..dc9fa9d36f 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -1,166 +1,94 @@ -#!/usr/bin/env python - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +__author__ = 'Sylvain Durand <sylvain.durand@ponts.org>' __license__ = 'GPL v3' -__copyright__ = '2012, 2013, Rémi Vanicat <vanicat at debian.org>' -''' -Lemonde.fr: Version abonnée -''' +import time -import os, zipfile, re, time -from urllib2 import HTTPError -from calibre.constants import preferred_encoding - +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ptempfile import PersistentTemporaryFile +from urllib2 import HTTPError -class LeMondeAbonne(BasicNewsRecipe): +class LeMonde(BasicNewsRecipe): - title = u'Le Monde: Édition abonnés' - __author__ = u'Rémi Vanicat' - description = u'Actualités' - category = u'Actualités, France, Monde' - publisher = 'Le Monde' - language = 'fr' - needs_subscription = True - no_stylesheets = True - smarten_punctuation = True - remove_attributes = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vscape', 'hspace', 'alt', 'width', 'height'] - extra_css = ''' li{margin:6pt 0} - ul{margin:0} + title = u'Le Monde: Édition abonnés' + __author__ = 'Sylvain Durand' + description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.' 
+ language = 'fr' + encoding = 'utf8' - div.photo img{max-width:100%; border:0px transparent solid;} - div.photo{font-family:inherit; color:#333; text-align:center;} - div.photo p{text-align:justify;font-size:.9em; line-height:.9em;} + needs_subscription = True - @page{margin:10pt} - .ar-txt {color:#000; text-align:justify;} - h1{text-align:left; font-size:1.25em;} + date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html' + login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml' + masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png' + couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg' - .auteur{text-align:right; font-weight:bold} - .feed{text-align:right; font-weight:bold} - .po-ti2{font-weight:bold} - .fen-tt{font-weight:bold;font-size:1.1em} - ''' + extra_css = ''' + img{max-width:100%} + h1{font-size:1.2em !important; line-height:1.2em !important; } + h2{font-size:1em !important; line-height:1em !important; } + h3{font-size:1em !important; text-transform:uppercase !important; color:#666;} + #photo{text-align:center !important; margin:10px 0 -8px;} + #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} ''' - zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip' - coverurl_format = '/img/%y%m%d01.jpg' - path_format = "%y%m%d" - login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + keep_only_tags = [dict(name=['h1','h2','h3','div','txt'])] - keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] - 
- - remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })] - - article_id_pattern = re.compile("[0-9]+\\.html") - article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' + def __init__(self, options, log, progress_reporter): + BasicNewsRecipe.__init__(self, options, log, progress_reporter) + br = BasicNewsRecipe.get_browser(self) + second = time.time() + 24*60*60 + for i in range(7): + self.date = time.gmtime(second) + try: + br.open(time.strftime(self.date_url,self.date)) + break + except HTTPError: + second -= 24*60*60 + self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ') def get_browser(self): br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open(self.login_url) - br.select_form(nr=0) - br['login'] = self.username - br['password'] = self.password - br.submit() + br.open(self.login_url) + br.select_form(nr=0) + br['login'] = self.username + br['password'] = self.password + br.submit() return br - decalage = 24 * 60 * 60 # today Monde has tomorow date - def get_cover_url(self): - url = time.strftime(self.coverurl_format, self.ltime) - return self.articles_path + url + url = time.strftime(self.couverture_url,self.date) + return url def parse_index(self): - browser = self.get_browser() - - second = time.time() - second += self.decalage - - for i in range(7): - self.ltime = time.gmtime(second) - self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding) - url = time.strftime(self.zipurl_format,self.ltime) - try: - response = browser.open(url) - continue - except HTTPError: - second -= 24*60*60 - - tmp = PersistentTemporaryFile(suffix='.zip') - self.report_progress(0.1,_('downloading zip file')) - tmp.write(response.read()) - tmp.close() - - zfile = zipfile.ZipFile(tmp.name, 'r') - self.report_progress(0.1,_('extracting zip file')) - - zfile.extractall(self.output_dir) - 
zfile.close() - - path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data") - - self.articles_path = path - - files = os.listdir(path) - - nb_index_files = len([ name for name in files if re.match("frame_gauche_[0-9]+.html", name) ]) - - flux = [] - - article_url = time.strftime(self.article_url_format, self.ltime) - - for i in range(nb_index_files): - filename = os.path.join(path, "selection_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES) - title=soup.find('span').contents[0] - if title=="Une": - title="À la une" - if title=="Evenement": - title="L'événement" - if title=="Planete": - title="Planète" - if title=="Economie - Entreprises": - title="Économie" - if title=="L'Oeil du Monde": - title="L'œil du Monde" - if title=="Enquete": - title="Enquête" - if title=="Editorial - Analyses": - title="Analyses" - if title=="Le Monde Economie": - title="Économie" - if title=="Le Monde Culture et idées": - title="Idées" - if title=="Le Monde Géo et politique": - title="Géopolitique" - tmp.close() - - filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup = BeautifulSoup(tmp) + url = time.strftime(self.journal_url,self.date) + soup = self.index_to_soup(url).sommaire + sections = [] + for sec in soup.findAll("section"): articles = [] - for link in soup.findAll("a"): - article_file = link['href'] - article_id=self.article_id_pattern.search(article_file).group() - article = { - 'title': link.contents[0], - 'url': article_url + article_id, - 'description': '', - 'content': '' - } - articles.append(article) - tmp.close() + if sec['cahier'] != "Le Monde": + for col in sec.findAll("fnts"): + col.extract() + if sec['cahier']=="Le Monde Magazine": + continue + for art in sec.findAll("art"): + if art.txt.string and art.ttr.string: + if art.find(['url']): + art.insert(6,'<div id="photo"><img src="'+art.find(['url']).string+'" /></div>') + if 
art.find(['lgd']) and art.find(['lgd']).string: + art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>') + article = "<html><head></head><body>"+unicode(art)+"</body></html>" + article = article.replace('<![CDATA[','').replace(']]>','').replace(' oC ','°C ') + article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>') + f = PersistentTemporaryFile() + f.write(article) + articles.append({'title':art.ttr.string,'url':"file:///"+f.name}) + sections.append((sec['nom'], articles)) + return sections - flux.append((title, articles)) + def preprocess_html(self, soup): + for lgd in soup.findAll(id="lgd"): + lgd.contents[-1].extract() + return soup - return flux - - - -# Local Variables: -# mode: python -# End: diff --git a/recipes/ledevoir.recipe b/recipes/ledevoir.recipe index f79c010804..6208229386 100644 --- a/recipes/ledevoir.recipe +++ b/recipes/ledevoir.recipe @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __author__ = 'Lorenzo Vigentini and Olivier Daigle' __copyright__ = '2012, Lorenzo Vigentini <l.vigentini at gmail.com>, Olivier Daigle <odaigle _at nuvucameras __dot__ com>' __version__ = 'v1.01' -__date__ = '12, February 2012' +__date__ = '22, December 2012' __description__ = 'Canadian Paper ' ''' @@ -32,41 +32,50 @@ class ledevoir(BasicNewsRecipe): recursion = 10 needs_subscription = 'optional' + filterDuplicates = False url_list = [] remove_javascript = True no_stylesheets = True - auto_cleanup = True preprocess_regexps = [(re.compile(r'(title|alt)=".*?>.*?"', re.DOTALL), lambda m: '')] - #keep_only_tags = [ + keep_only_tags = [ #dict(name='div', attrs={'id':'article_detail'}), - #dict(name='div', attrs={'id':'colonne_principale'}) - #] + #dict(name='div', attrs={'id':'colonne_principale'}), + dict(name='article', attrs={'id':'article', 'class':'clearfix'}), + dict(name='article', attrs={'id':'article', 'class':'clearfix portrait'}) + ] - #remove_tags = [ - #dict(name='div', attrs={'id':'dialog'}), - #dict(name='div', 
attrs={'class':['interesse_actions','reactions','taille_du_texte right clearfix','partage_sociaux clearfix']}), - #dict(name='aside', attrs={'class':['article_actions clearfix','reactions','partage_sociaux_wrapper']}), - #dict(name='ul', attrs={'class':'mots_cles'}), - #dict(name='ul', attrs={'id':'commentaires'}), - #dict(name='a', attrs={'class':'haut'}), - #dict(name='h5', attrs={'class':'interesse_actions'}) - #] + remove_tags = [ + dict(name='div', attrs={'id':'prive'}), + dict(name='div', attrs={'class':'acheter_article'}), + dict(name='div', attrs={'id':'col_complement'}), + dict(name='div', attrs={'id':'raccourcis','class':'clearfix'}), + dict(name='div', attrs={'id':'dialog'}), + dict(name='div', attrs={'id':'liste_photos_article','class':'clearfix'}), + dict(name='script', attrs={'type':'text/javascript'}), + dict(name='div', attrs={'class':['interesse_actions','reactions','taille_du_texte right clearfix','partage_sociaux clearfix']}), + dict(name='aside', attrs={'class':['article_actions clearfix','partage_sociaux_wrapper']}), + dict(name='aside', attrs={'class':'reactions', 'id':'reactions'}), + dict(name='ul', attrs={'class':'mots_cles'}), + dict(name='ul', attrs={'id':'commentaires'}), + dict(name='a', attrs={'class':'haut'}), + dict(name='h5', attrs={'class':'interesse_actions'}) + ] feeds = [ (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'), - (u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), - (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'), - (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'), - (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'), - (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'), - (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'), - (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'), - (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'), - (u'Economie', 
'http://www.ledevoir.com/rss/section/economie.xml?id=49'), - (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'), +# (u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), +# (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'), +# (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'), +# (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'), +# (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'), +# (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'), +# (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'), +# (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'), +# (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'), +# (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'), (u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50') ] @@ -97,4 +106,10 @@ class ledevoir(BasicNewsRecipe): br.submit() return br + def print_version(self, url): + if self.filterDuplicates: + if url in self.url_list: + return + self.url_list.append(url) + return url diff --git a/recipes/legeartis.recipe b/recipes/legeartis.recipe index 8365d3639d..1b882c26d7 100644 --- a/recipes/legeartis.recipe +++ b/recipes/legeartis.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python -__license__ = 'GPL v3' +__license__ = 'GPL v3' __author__ = 'Mori' __version__ = 'v. 
0.1' ''' @@ -10,34 +10,34 @@ olgierd.bblog.pl from calibre.web.feeds.news import BasicNewsRecipe class LegeArtisRecipe(BasicNewsRecipe): - __author__ = 'Mori' - language = 'pl' + __author__ = 'Mori' + language = 'pl' - title = u'Lege Artis' - publisher = u'Olgierd Rudak' - description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107' + title = u'Lege Artis' + publisher = u'Olgierd Rudak' + description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107' - max_articles_per_feed = 100 - - no_stylesheets = True - remove_javascript = True - - extra_css = ''' - img{clear: both;} - ''' - - feeds = [ - (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml') - ] - - keep_only_tags = [ - dict(name = 'div', attrs = {'class' : 'post_title'}), - dict(name = 'div', attrs = {'class' : 'post_date'}), - dict(name = 'div', attrs = {'class' : 'post_content'}) - ] - - remove_tags = [ - dict(name = 'div', attrs = {'id' : 'bb_tools'}), - dict(name = 'div', attrs = {'class' : 'post_comments'}), - dict(name = 'object', attrs = {}) - ] + max_articles_per_feed = 100 + + no_stylesheets = True + remove_javascript = True + + extra_css = ''' + img{clear: both;} + ''' + + feeds = [ + (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml') + ] + + keep_only_tags = [ + dict(name = 'div', attrs = {'class' : 'post_title'}), + dict(name = 'div', attrs = {'class' : 'post_date'}), + dict(name = 'div', attrs = {'class' : 'post_content'}) + ] + + remove_tags = [ + dict(name = 'div', attrs = {'id' : 'bb_tools'}), + dict(name = 'div', attrs = {'class' : 'post_comments'}), + dict(name = 'object', attrs = {}) + ] diff --git a/recipes/legitymizm.recipe b/recipes/legitymizm.recipe index e54d2beefd..b135cefae6 100644 --- a/recipes/legitymizm.recipe +++ b/recipes/legitymizm.recipe @@ -32,7 +32,7 @@ class Legitymizm(BasicNewsRecipe): #szeroka_kolumna ul.wykaz { list-style-type: none; margin: 0 0 1.2em 0; padding: 0; 
} #szeroka_kolumna ul.wykaz li.wykaz_2 { font-weight: bold; margin: 0.6em 0 0 0; } #szeroka_kolumna ul.wykaz a { text-decoration: none; } - #szeroka_kolumna ul.wykaz li.wykaz_1, #szeroka_kolumna ul.wykaz li.wykaz_2 ul li { list-style-type: square; color: #898981; text-transform: none; font-weight: normal; padding: 0; } + #szeroka_kolumna ul.wykaz li.wykaz_1, #szeroka_kolumna ul.wykaz li.wykaz_2 ul li { list-style-type: square; color: #898981; text-transform: none; font-weight: normal; padding: 0; } #szeroka_kolumna ul.wykaz li.wykaz_1 { margin: 0 0 0 1.3em; } #szeroka_kolumna ul.wykaz li.wykaz_2 ul { margin: 0; padding: 0 0 0 1.3em; } #szeroka_kolumna h3.autor { background-color: #898981; color: #f9f9f8; margin: -25px 0px 30px 0; text-align: left; padding: 0 0 0 2px; } @@ -47,4 +47,3 @@ class Legitymizm(BasicNewsRecipe): #cytat p.sentencja:first-letter { font-size: 44px; line-height: 33px; margin: 0 2px 0 0; font-style: normal; float: left; display: block; } p.autor { text-transform: uppercase; color: #898981; font-style: normal; text-align: left; } ''' - diff --git a/recipes/lifehacker_pl.recipe b/recipes/lifehacker_pl.recipe new file mode 100644 index 0000000000..57b6f8a571 --- /dev/null +++ b/recipes/lifehacker_pl.recipe @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = 'MrStefan' + +''' +www.lifehacking.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class lifehacking(BasicNewsRecipe): + title = u'Lifehacker Polska' + __author__ = 'MrStefan <mrstefaan@gmail.com>' + language = 'pl' + description =u'Lifehacking - sposoby na zwiększanie własnej wydajności. Ułatwiaj sobie życie, wykorzystując wiedzę, metody, technologie, przydatne strony ...' 
+ masthead_url='http://lifehacking.pl/wp-content/themes/lifehacking/images/lifehackerlogo.png' + remove_empty_feeds= True + oldest_article = 7 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + feeds = [(u'Lifehacker polska', u'http://feeds.feedburner.com/pl_lh')] diff --git a/recipes/linux_journal.recipe b/recipes/linux_journal.recipe index 99b1a570dc..2a8c7dd2e9 100644 --- a/recipes/linux_journal.recipe +++ b/recipes/linux_journal.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment class LinuxJournal(BasicNewsRecipe): title = u'Linux Journal' @@ -25,6 +26,9 @@ class LinuxJournal(BasicNewsRecipe): soup2 = self.index_to_soup('http://www.linuxjournal.com'+ nexturl) pagetext = soup2.find(attrs={'class':'node-inner'}).find(attrs={'class':'content'}) next = appendtag.find('li', attrs={'class':'pager-next'}) + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) tag = appendtag.find('div', attrs={'class':'links'}) @@ -33,4 +37,4 @@ class LinuxJournal(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) - return soup \ No newline at end of file + return soup diff --git a/recipes/lomza.recipe b/recipes/lomza.recipe index d7e224d13d..2c31271624 100644 --- a/recipes/lomza.recipe +++ b/recipes/lomza.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Lomza(BasicNewsRecipe): title = u'4Lomza' __author__ = 'fenuks' - description = u'4Łomża - regional site' + description = u'Regionalny portal. Najświeższe informacje z regionu, kulturalne, sportowe. Ogłoszenia, baza biznesu, forum.' 
cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg' language = 'pl' oldest_article = 15 diff --git a/recipes/michalkiewicz.recipe b/recipes/michalkiewicz.recipe index 7a1e424f0b..4e4aee0db6 100644 --- a/recipes/michalkiewicz.recipe +++ b/recipes/michalkiewicz.recipe @@ -8,8 +8,6 @@ michalkiewicz.pl from calibre.web.feeds.news import BasicNewsRecipe -# - class michalkiewicz(BasicNewsRecipe): title = u'Stanis\u0142aw Michalkiewicz' description = u'Strona autorska * felietony * artyku\u0142y * komentarze' @@ -23,4 +21,3 @@ class michalkiewicz(BasicNewsRecipe): remove_tags = [dict(name='ul', attrs={'class':'menu'})] feeds = [(u'Teksty', u'http://www.michalkiewicz.pl/rss.xml')] - diff --git a/recipes/midday.recipe b/recipes/midday.recipe deleted file mode 100644 index 4dbee1d2f3..0000000000 --- a/recipes/midday.recipe +++ /dev/null @@ -1,13 +0,0 @@ -from calibre.web.feeds.news import CalibrePeriodical - -class MiDDay(CalibrePeriodical): - - title = 'MiDDay' - calibre_periodicals_slug = 'midday' - - description = '''Get your dose of the latest news, views and fun - from the - world of politics, sports and Bollywood to the cartoons, comics and games of - the entertainment section - India’s leading tabloid has it all. 
To subscribe - visit <a href="http://news.calibre-ebook.com/periodical/midday">calibre - Periodicals</a>.''' - language = 'en_IN' diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index 741397d08a..4622e73909 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -1,5 +1,5 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - +import re from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): title = u'Młody technik' @@ -7,9 +7,28 @@ class Mlody_technik(BasicNewsRecipe): description = u'Młody technik' category = 'science' language = 'pl' - cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' + #cover_url = 'http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' no_stylesheets = True + preprocess_regexps = [(re.compile(r"<h4>Podobne</h4>", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 - #keep_only_tags=[dict(id='container')] - feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')] + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':'st-related-posts'})] + remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + #(u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] + + def get_cover_url(self): + soup = self.index_to_soup('http://www.mt.com.pl/') + tag = soup.find(attrs={'class':'xoxo'}) + if tag: + self.cover_url = 
tag.find('img')['src'] + return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/money_pl.recipe b/recipes/money_pl.recipe index 475c2059ff..8171d05a07 100644 --- a/recipes/money_pl.recipe +++ b/recipes/money_pl.recipe @@ -60,7 +60,7 @@ class FocusRecipe(BasicNewsRecipe): ] def print_version(self, url): - if url.count ('money.pl.feedsportal.com'): + if url.count ('money.pl.feedsportal.com'): u = url.find('0Cartykul0C') u = 'http://www.m.money.pl/wiadomosci/artykul/' + url[u + 21:] u = u.replace('0C', '/') @@ -71,6 +71,6 @@ class FocusRecipe(BasicNewsRecipe): u = u.replace ('0B','.') u = u.replace (',0,',',-1,') u = u.replace('0Tutm0Isource0Frss0Gutm0Imedium0Frss0Gutm0Icampaign0Frss/story01.htm', '') - else: + else: u = url.replace('/nc/1','/do-druku/1') - return u + return u diff --git a/recipes/more_intelligent_life.recipe b/recipes/more_intelligent_life.recipe new file mode 100644 index 0000000000..e90f883080 --- /dev/null +++ b/recipes/more_intelligent_life.recipe @@ -0,0 +1,67 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>' +''' +moreintelligentlife.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class MoreIntelligentLife(BasicNewsRecipe): + title = 'More Intelligent Life' + __author__ = 'Darko Miletic' + description = "More Intelligent Life (moreintelligentlife.com) is the online version of Intelligent Life, a lifestyle and culture magazine from The Economist. The website offers not only content from the print edition, trickled out over the course of its shelf-life, but also the Editors' Blog, which carries daily posts from the editorial team-quickfire observations and opinions that allow readers to eavesdrop on the conversation in the office." 
+ publisher = 'The Economist Newspaper ltd' + category = 'arts,lifestyle,intelligent life,the economist,ideas,style,culture' + oldest_article = 60 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'website' + extra_css = """ + body{font-family: Arial,"Helvetica neue","Bitstream Vera Sans",sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [dict(attrs={'class':'node'})] + remove_tags_after = dict(attrs={'class':'tags'}) + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [(u'Articles', u'http://feeds.feedburner.com/MoreintelligentlifeTotal')] + + def get_cover_url(self): + soup = self.index_to_soup('http://moreintelligentlife.com/') + for image in soup.findAll('img', src=True): + if image['src'].startswith('http://moreintelligentlife.com/files/covers/current_issue_'): + return image['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/myapple_pl.recipe b/recipes/myapple_pl.recipe index df5708a325..6f7f37633e 100644 --- a/recipes/myapple_pl.recipe +++ b/recipes/myapple_pl.recipe @@ -44,6 +44,6 @@ class MyAppleRecipe(BasicNewsRecipe): def preprocess_html(self, soup): for alink in soup.findAll('a'): if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) + tstr = 
alink.string + alink.replaceWith(tstr) return soup diff --git a/recipes/naszdziennik.recipe b/recipes/naszdziennik.recipe index 4c7b78c199..7d3f27da8b 100644 --- a/recipes/naszdziennik.recipe +++ b/recipes/naszdziennik.recipe @@ -15,7 +15,7 @@ class naszdziennik(BasicNewsRecipe): no_stylesheets = True keep_only_tags =[dict(attrs = {'id' : 'article'})] - + #definiujemy nową funkcje; musi zwracać listę feedów wraz z artykułami def parse_index(self): #adres do parsowania artykułów @@ -28,7 +28,7 @@ class naszdziennik(BasicNewsRecipe): sections = [] #deklaracja pierwszej sekcji jako pusty string section = '' - + #pętla for, która analizuje po kolei każdy tag "news-article" for item in soup.findAll(attrs = {'class' : 'news-article'}) : #w tagu "news-article szukamy pierwszego taga h4" @@ -51,11 +51,11 @@ class naszdziennik(BasicNewsRecipe): #jako tytuł użyty będzie tekst pomiędzy tagami <a> article_title = self.tag_to_string(article_a) #a data będzie tekstem z pierwszego taga h4 znalezionego w tagu title-datetime - article_date = self.tag_to_string(article_title_datetime.find('h4')) + article_date = self.tag_to_string(article_title_datetime.find('h4')) #zebrane elementy dodajemy do listy zadeklarowanej w linijce 44 articles[section].append( { 'title' : article_title, 'url' : article_url, 'date' : article_date }) #po dodaniu wszystkich artykułów dodajemy sekcje do listy feedów, korzystając z list sekcji znajdujących się w słowniku for section in sections: feeds.append((section, articles[section])) #zwracamy listę feedów, której parsowaniem zajmie się calibre - return feeds \ No newline at end of file + return feeds diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe new file mode 100644 index 0000000000..c524c18b26 --- /dev/null +++ b/recipes/nauka_w_polsce.recipe @@ -0,0 +1,47 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class NaukawPolsce(BasicNewsRecipe): + title = u'Nauka w Polsce' + __author__ = 'fenuks' + description = 
u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' + category = 'science' + language = 'pl' + cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + index = 'http://www.naukawpolsce.pl' + keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})] + remove_tags = [dict(name='div', attrs={'class':'tagi'})] + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + for i in soup.findAll(name='div', attrs={'class':'aktualnosci-margines lista-depesz information-content'}): + title = i.h1.a.string + url = self.index + i.h1.a['href'] + date = '' #i.span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Historia i kultura", self.find_articles('http://www.naukawpolsce.pl/historia-i-kultura/'))) + feeds.append((u"Kosmos", self.find_articles('http://www.naukawpolsce.pl/kosmos/'))) + feeds.append((u"Przyroda", self.find_articles('http://www.naukawpolsce.pl/przyroda/'))) + feeds.append((u"Społeczeństwo", self.find_articles('http://www.naukawpolsce.pl/spoleczenstwo/'))) + feeds.append((u"Technologie", self.find_articles('http://www.naukawpolsce.pl/technologie/'))) + feeds.append((u"Uczelnie", self.find_articles('http://www.naukawpolsce.pl/uczelnie/'))) + feeds.append((u"Nauki medyczne", self.find_articles('http://www.naukawpolsce.pl/zdrowie/'))) + + return feeds + + def preprocess_html(self, soup): + for p in soup.findAll(name='p', text=re.compile(' ')): + p.extract() + return soup diff --git 
a/recipes/navegalo.recipe b/recipes/navegalo.recipe new file mode 100644 index 0000000000..89f6cde45d --- /dev/null +++ b/recipes/navegalo.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1360354988(BasicNewsRecipe): + title = u'Navegalo.com' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + +from calibre.web.feeds.news import BasicNewsRecipe + +class navegalonews(BasicNewsRecipe): + __author__ = 'Douglas Delgado' + title = u'Navegalo.com' + publisher = 'Navegalo.com' + description = 'Noticias actualizadas por Navegalo.com. Recipe creado por Douglas Delgado (doudelgado@gmail.com) para su uso con Calibre' + category = 'Spanish, Entertainment' + masthead_url = 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQZhML5lwsdss6FFF7CFR0Sf-Ln052Zmhs1TlIOcAL8JWN8a-dPlA' + + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + auto_cleanup = True + encoding = 'utf-8' + language = 'es_CR' + use_embedded_content = False + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + + feeds = [(u'Nacionales', u'http://www.navegalo.com/es/noticias/noticias/noticias-nacionales?format=feed&type=rss'), (u'Internacionales', u'http://direct.navegalo.com/es/noticias/noticias/noticias-internacionales?format=feed&type=rss'), (u'Deportes', u'http://direct.navegalo.com/es/noticias/noticias/deportes-nacionales?format=feed&type=rss'), (u'Solo futbol', u'http://www.navegalo.com/es/noticias/noticias/solo-futbol?format=feed&type=rss'), (u'Entretenimiento', u'http://www.navegalo.com/es/noticias/noticias/entretenimiento?format=feed&type=rss'), (u'Solo para ellas', u'http://www.navegalo.com/es/noticias/noticias/solo-para-ellas?format=feed&type=rss'), (u'Infiltrados', u'http://direct.navegalo.com/es/noticias/noticias/infiltrados?format=feed&type=rss'), (u'Mano a mano', u'http://direct.navegalo.com/es/noticias/noticias/mano-a-mano?format=feed&type=rss')] + + + + + extra_css = ''' + 
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;} + ''' + diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index c5f1b0aff2..2730b45d6d 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>' +__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>' ''' newyorker.com ''' @@ -44,20 +44,18 @@ class NewYorker(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [ - dict(name='div', attrs={'class':'headers'}) - ,dict(name='div', attrs={'id':['articleheads','items-container','articleRail','articletext','photocredits']}) - ] + keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})] remove_tags = [ dict(name=['meta','iframe','base','link','embed','object']) - ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons'] }) + ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] }) ,dict(attrs={'id':['show-header','show-footer'] }) ] + remove_tags_after = dict(attrs={'class':'entry-content'}) remove_attributes = ['lang'] feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')] def print_version(self, url): - return url + '?printable=true' + return url + '?printable=true¤tPage=all' def image_url_processor(self, baseurl, url): return url.strip() diff --git a/recipes/nezavisne_novine.recipe b/recipes/nezavisne_novine.recipe new file mode 100644 index 0000000000..357c478ce1 --- /dev/null +++ b/recipes/nezavisne_novine.recipe @@ -0,0 +1,59 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>' +''' +www.nezavisne.com +''' +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + 
+class NezavisneNovine(BasicNewsRecipe): + title = 'Nezavisne novine' + __author__ = 'Darko Miletic' + description = 'Nezavisne novine - Najnovije vijesti iz BiH, Srbije, Hrvatske, Crne Gore i svijeta' + publisher = 'NIGP "DNN"' + category = 'news, politics, Bosnia, Balcans' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'sr' + remove_empty_feeds = True + publication_type = 'newspaper' + cover_url = strftime('http://pdf.nezavisne.com/slika/novina/nezavisne_novine.jpg?v=%Y%m%d') + masthead_url = 'http://www.nezavisne.com/slika/osnova/nezavisne-novine-logo.gif' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + keep_only_tags = [dict(name='div', attrs={'class':'vijest'})] + remove_tags_after = dict(name='div', attrs={'id':'wrap'}) + remove_tags = [ + dict(name=['meta','link','iframe','object']) + ,dict(name='div', attrs={'id':'wrap'}) + ] + remove_attributes=['lang','xmlns:fb','xmlns:og'] + + + feeds = [ + (u'Novosti' , u'http://feeds.feedburner.com/Novosti-NezavisneNovine' ) + ,(u'Posao' , u'http://feeds.feedburner.com/Posao-NezavisneNovine' ) + ,(u'Sport' , u'http://feeds.feedburner.com/Sport-NezavisneNovine' ) + ,(u'Komentar' , u'http://feeds.feedburner.com/Komentari-NezavisneNovine' ) + ,(u'Umjetnost i zabava' , u'http://feeds.feedburner.com/UmjetnostIZabava-NezavisneNovine' ) + ,(u'Život i stil' , u'http://feeds.feedburner.com/ZivotIStil-NezavisneNovine' ) + ,(u'Auto' , u'http://feeds.feedburner.com/Auto-NezavisneNovine' ) + ,(u'Nauka i tehnologija', u'http://feeds.feedburner.com/NaukaITehnologija-NezavisneNovine') + ] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/niebezpiecznik.recipe 
b/recipes/niebezpiecznik.recipe index b33a0a3513..2228ef5122 100644 --- a/recipes/niebezpiecznik.recipe +++ b/recipes/niebezpiecznik.recipe @@ -3,14 +3,14 @@ from calibre.web.feeds.news import BasicNewsRecipe class Niebezpiecznik_pl(BasicNewsRecipe): title = u'Niebezpiecznik.pl' __author__ = 'fenuks' - description = 'Niebezpiecznik.pl' + description = u'Niebezpiecznik.pl – o bezpieczeństwie i nie...' category = 'hacking, IT' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets = True - cover_url =u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png' - remove_tags=[dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})] - keep_only_tags= [dict(name='div', attrs={'class':['title', 'entry']})] + cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png' + remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})] + keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})] feeds = [(u'Wiadomości', u'http://feeds.feedburner.com/niebezpiecznik/'), ('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')] diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 7715b9826a..1808d54824 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -9,7 +9,7 @@ class Nowa_Fantastyka(BasicNewsRecipe): __modified_by__ = 'zaslav' language = 'pl' encoding='latin2' - description ='site for fantasy readers' + description = u'Strona dla miłośników fantastyki' category='fantasy' masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg' #extra_css='.tytul {font-size: 20px;}' #not working diff --git a/recipes/nowy_obywatel.recipe b/recipes/nowy_obywatel.recipe new file mode 100644 index 0000000000..66bdfed390 --- /dev/null +++ b/recipes/nowy_obywatel.recipe @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 
'GPL v3' +__copyright__ = u'Łukasz Grąbczewski 2013' +__version__ = '1.0' + +''' +nowyobywatel.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class nowyobywatel(BasicNewsRecipe): + __author__ = u'Łukasz Grączewski' + title = u'Nowy Obywatel' + description = u'Pismo na rzecz sprawiedliwości społecznej' + language = 'pl' + publisher = 'Stowarzyszenie „Obywatele Obywatelom”' + publication_type = 'magazine' + masthead_url = 'http://lewicowo.pl/wp-content/uploads/2011/11/nowy-obywatel1.png' + no_stylesheets = True + remove_javascript = True + use_embedded_content = True + remove_empty_feeds = True + + oldest_article = 32 #monthly +1 + max_articles_per_feed = 100 + simultaneous_downloads = 20 + + feeds = [ + (u'Aktualności', u'http://nowyobywatel.pl/category/aktualnosci/feed/'), + (u'Opinie', u'http://nowyobywatel.pl/category/opinie/feed/'), + (u'Nasze rozmowy', u'http://nowyobywatel.pl/category/nasze-rozmowy/feed/'), + (u'Inspiracje', u'http://nowyobywatel.pl/category/inspiracje/feed/') + ] + + + remove_tags = [] + remove_tags.append(dict(attrs = {'class' : 'post-date'})) + remove_tags.append(dict(attrs = {'class' : 'printfriendly'})) + remove_tags.append(dict(attrs = {'class' : 'social4i'})) diff --git a/recipes/nto.recipe b/recipes/nto.recipe new file mode 100644 index 0000000000..ed29db22ba --- /dev/null +++ b/recipes/nto.recipe @@ -0,0 +1,68 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +class NTO(BasicNewsRecipe): + title = u'Nowa Trybuna Opolska' + __author__ = 'fenuks' + description = u'Nowa Trybuna Opolska - portal regionalny województwa opolskiego.' 
+ category = 'newspaper' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + INDEX = 'http://www.nto.pl' + masthead_url = INDEX + '/images/top_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = [dict(id=['article', 'cover', 'photostory'])] + remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections', + 'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections', + 'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']), + dict(attrs={'class':'articleFunctions'})] + + feeds = [(u'Wszystkie', u'http://www.nto.pl/rss.xml'), (u'Region', u'http://www.nto.pl/region.xml'), (u'Brzeg', u'http://www.nto.pl/brzeg.xml'), (u'G\u0142ubczyce', u'http://www.nto.pl/glubczyce.xml'), (u'K\u0119dzierzyn-Ko\u017ale', u'http://www.nto.pl/kedzierzynkozle.xml'), (u'Kluczbork', u'http://www.nto.pl/kluczbork.xml'), (u'Krapkowice', u'http://www.nto.pl/krapkowice.xml'), (u'Namys\u0142\xf3w', u'http://www.nto.pl/namyslow.xml'), (u'Nysa', u'http://www.nto.pl/nysa.xml'), (u'Olesno', u'http://www.nto.pl/olesno.xml'), (u'Opole', u'http://www.nto.pl/opole.xml'), (u'Prudnik', u'http://www.nto.pl/prudnik.xml'), (u'Strzelce Opolskie', u'http://www.nto.pl/strzelceopolskie.xml'), (u'Sport', u'http://www.nto.pl/sport.xml'), (u'Polska i \u015bwiat', u'http://www.nto.pl/apps/pbcs.dll/section?Category=RSS&channel=KRAJSWIAT'), (u'Zdrowy styl', 
u'http://www.nto.pl/apps/pbcs.dll/section?Category=rss_zdrowystyl'), (u'Reporta\u017c', u'http://www.nto.pl/reportaz.xml'), (u'Studia', u'http://www.nto.pl/akademicka.xml')] + + def get_cover_url(self): + soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI') + nexturl = self.INDEX + soup.find(id='covers').find('a')['href'] + soup = self.index_to_soup(nexturl) + self.cover_url = self.INDEX + soup.find(id='cover').find(name='img')['src'] + return getattr(self, 'cover_url', self.cover_url) + + def append_page(self, soup, appendtag): + tag = soup.find('span', attrs={'class':'photoNavigationPages'}) + if tag: + number = int(tag.string.rpartition('/')[-1].replace(' ', '')) + baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1] + + for r in appendtag.findAll(attrs={'class':'photoNavigation'}): + r.extract() + for nr in range(2, number+1): + soup2 = self.index_to_soup(baseurl + str(nr)) + pagetext = soup2.find(id='photoContainer') + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoMeta'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'photoStoryText'}) + if pagetext: + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index d0f311818e..c4a4b3cee5 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -41,7 +41,7 @@ class NYTimes(BasicNewsRecipe): # number of days old an article can be for inclusion. If oldest_web_article = None all articles # will be included. 
Note: oldest_web_article is ignored if webEdition = False webEdition = False - oldest_web_article = 7 + oldest_web_article = None # download higher resolution images than the small thumbnails typically included in the article # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper @@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe): 'relatedSearchesModule', 'side_tool', 'singleAd', + 'postCategory column', + 'refer tagRefer', # added for bits blog post 'entry entry-utility', #added for DealBook 'entry-tags', #added for DealBook 'footer promos clearfix', #added for DealBook @@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe): return True if '/video/' in url: return True + if '/multimedia/' in url: + return True if '/slideshow/' in url: return True if '/magazine/index' in url: @@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe): return True if '/premium/' in url: return True + if '#comment' in url: + return True + if '#postComment' in url: + return True + if '#postcomment' in url: + return True + if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None: + print("NO DATE IN "+url) + return True return False def fixChars(self,string): @@ -363,6 +376,7 @@ class NYTimes(BasicNewsRecipe): cover_tag = 'NY_NYT' def get_cover_url(self): + from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg' br = BasicNewsRecipe.get_browser(self) daysback=1 @@ -385,7 +399,6 @@ class NYTimes(BasicNewsRecipe): masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - def short_title(self): return self.title @@ -647,75 +660,53 @@ class NYTimes(BasicNewsRecipe): soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') - # Fetch the content table - content_table = soup.find('table',{'id':'content'}) - if content_table is None: - self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE") - return None - - # Within this table 
are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections - - for td_col in content_table.findAll('td', {'id' : re.compile('Column')}): - for div_sec in td_col.findAll('div',recursive=False): - for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}): - - section_name = self.tag_to_string(h6_sec_name,use_alt=False) - section_name = re.sub(r'^ *$','',section_name) - - if section_name == '': + section_name='Unknown Section' + pubdate = strftime('%a, %d %b') + for td_col in soup.findAll('td'): + h6_sec_name = td_col.find('h6') + if h6_sec_name is not None: + new_section_name = self.tag_to_string(h6_sec_name,use_alt=False) + new_section_name = re.sub(r'^ *$','',new_section_name) + if new_section_name == '': + continue + section_name = new_section_name + continue + atag = td_col.find('a') + if atag is not None: + h4tag = None + for h4tag in atag.findNextSiblings('h4'): + break + if h4tag is None: + continue + author = self.tag_to_string(h4tag,use_alt=False) + try: + url = re.sub(r'\?.*', '', atag['href']) + except: + continue + if self.exclude_url(url): + continue + if '?' 
in url: + url += '&pagewanted=all' + else: + url += '?pagewanted=all' + if self.filterDuplicates: + if url in self.url_list: continue - if self.includeSections != []: - if section_name not in self.includeSections: - print "SECTION NOT INCLUDED: ",section_name - continue - if section_name in self.excludeSections: - print "SECTION EXCLUDED: ",section_name - continue - - section_name=string.capwords(section_name) - section_name = section_name.replace('Op-ed','Op-Ed') - section_name = section_name.replace('U.s.','U.S.') - section_name = section_name.replace('N.y.','N.Y.') - pubdate = strftime('%a, %d %b') - - search_div = div_sec - for next_tag in h6_sec_name.findNextSiblings(True): - if next_tag.__class__.__name__ == 'Tag': - if next_tag.name == 'div': - search_div = next_tag - break - - # Get the articles - for h3_item in search_div.findAll('h3'): - byline = h3_item.h6 - if byline is not None: - author = self.tag_to_string(byline,use_alt=False) - else: - author = '' - a = h3_item.find('a', href=True) - if not a: - continue - url = re.sub(r'\?.*', '', a['href']) - if self.exclude_url(url): - continue - url += '?pagewanted=all' - if self.filterDuplicates: - if url in self.url_list: - continue - self.url_list.append(url) - title = self.tag_to_string(a, use_alt=True).strip() - desc = h3_item.find('p') - if desc is not None: - description = self.tag_to_string(desc,use_alt=False) - else: - description = '' - if not self.articles.has_key(section_name): - self.ans.append(section_name) - self.articles[section_name] = [] - self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) + self.url_list.append(url) + title = self.tag_to_string(atag, use_alt=False).strip() + desc = atag.parent.find('p') + if desc is not None: + description = self.tag_to_string(desc,use_alt=False) + else: + description = '' + if not self.articles.has_key(section_name): + self.ans.append(section_name) + self.articles[section_name] = 
[] + print('Title '+title+' author '+author) + self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) + return self.filter_ans(self.ans) def parse_index(self): if self.headlinesOnly: @@ -825,8 +816,9 @@ class NYTimes(BasicNewsRecipe): for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}): if divr.find(text=re.compile('Sign up')): divr.extract() - divr = soup.find('div',attrs={'id':re.compile('related-content')}) + divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')}) if divr is not None: + print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False)) # handle related articles rlist = [] ul = divr.find('ul') @@ -856,6 +848,8 @@ class NYTimes(BasicNewsRecipe): asidediv.append(Tag(soup,'hr')) smain = soup.find('body') smain.append(asidediv) + else: + print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False)) for atag in soup.findAll('a'): img = atag.find('img') if img is not None: @@ -898,6 +892,18 @@ class NYTimes(BasicNewsRecipe): first_outer = outerdiv else: litag.extract() + for h6tag in rdiv.findAll('h6'): + if h6tag.find('a') is not None: + if h6tag.find('a')['href'].startswith('http://www.nytimes.com'): + url = re.sub(r'\?.*', '', h6tag.find('a')['href']) + h6tag.find('a')['href'] = url+'?pagewanted=all' + h6tag.extract() + related.append(h6tag) + if first_related is None: + first_related = rdiv + first_outer = outerdiv + else: + h6tag.extract() if related != []: for r in related: if r.h6: # don't want the anchor inside a h6 tag diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 06c476ef19..2dba2d505d 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe): 'relatedSearchesModule', 
'side_tool', 'singleAd', + 'postCategory column', + 'refer tagRefer', # added for bits blog post 'entry entry-utility', #added for DealBook 'entry-tags', #added for DealBook 'footer promos clearfix', #added for DealBook @@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe): return True if '/video/' in url: return True + if '/multimedia/' in url: + return True if '/slideshow/' in url: return True if '/magazine/index' in url: @@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe): return True if '/premium/' in url: return True + if '#comment' in url: + return True + if '#postComment' in url: + return True + if '#postcomment' in url: + return True + if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None: + print("NO DATE IN "+url) + return True return False def fixChars(self,string): @@ -371,6 +384,7 @@ class NYTimes(BasicNewsRecipe): cover_tag = 'NY_NYT' def get_cover_url(self): + from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg' br = BasicNewsRecipe.get_browser(self) daysback=1 @@ -393,7 +407,6 @@ class NYTimes(BasicNewsRecipe): masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - def short_title(self): return self.title @@ -655,75 +668,53 @@ class NYTimes(BasicNewsRecipe): soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') - # Fetch the content table - content_table = soup.find('table',{'id':'content'}) - if content_table is None: - self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE") - return None - - # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections - - for td_col in content_table.findAll('td', {'id' : re.compile('Column')}): - for div_sec in td_col.findAll('div',recursive=False): - for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}): - - section_name = self.tag_to_string(h6_sec_name,use_alt=False) - section_name = 
re.sub(r'^ *$','',section_name) - - if section_name == '': + section_name='Unknown Section' + pubdate = strftime('%a, %d %b') + for td_col in soup.findAll('td'): + h6_sec_name = td_col.find('h6') + if h6_sec_name is not None: + new_section_name = self.tag_to_string(h6_sec_name,use_alt=False) + new_section_name = re.sub(r'^ *$','',new_section_name) + if new_section_name == '': + continue + section_name = new_section_name + continue + atag = td_col.find('a') + if atag is not None: + h4tag = None + for h4tag in atag.findNextSiblings('h4'): + break + if h4tag is None: + continue + author = self.tag_to_string(h4tag,use_alt=False) + try: + url = re.sub(r'\?.*', '', atag['href']) + except: + continue + if self.exclude_url(url): + continue + if '?' in url: + url += '&pagewanted=all' + else: + url += '?pagewanted=all' + if self.filterDuplicates: + if url in self.url_list: continue - if self.includeSections != []: - if section_name not in self.includeSections: - print "SECTION NOT INCLUDED: ",section_name - continue - if section_name in self.excludeSections: - print "SECTION EXCLUDED: ",section_name - continue - - section_name=string.capwords(section_name) - section_name = section_name.replace('Op-ed','Op-Ed') - section_name = section_name.replace('U.s.','U.S.') - section_name = section_name.replace('N.y.','N.Y.') - pubdate = strftime('%a, %d %b') - - search_div = div_sec - for next_tag in h6_sec_name.findNextSiblings(True): - if next_tag.__class__.__name__ == 'Tag': - if next_tag.name == 'div': - search_div = next_tag - break - - # Get the articles - for h3_item in search_div.findAll('h3'): - byline = h3_item.h6 - if byline is not None: - author = self.tag_to_string(byline,use_alt=False) - else: - author = '' - a = h3_item.find('a', href=True) - if not a: - continue - url = re.sub(r'\?.*', '', a['href']) - if self.exclude_url(url): - continue - url += '?pagewanted=all' - if self.filterDuplicates: - if url in self.url_list: - continue - self.url_list.append(url) - title = 
self.tag_to_string(a, use_alt=True).strip() - desc = h3_item.find('p') - if desc is not None: - description = self.tag_to_string(desc,use_alt=False) - else: - description = '' - if not self.articles.has_key(section_name): - self.ans.append(section_name) - self.articles[section_name] = [] - self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) + self.url_list.append(url) + title = self.tag_to_string(atag, use_alt=False).strip() + desc = atag.parent.find('p') + if desc is not None: + description = self.tag_to_string(desc,use_alt=False) + else: + description = '' + if not self.articles.has_key(section_name): + self.ans.append(section_name) + self.articles[section_name] = [] + print('Title '+title+' author '+author) + self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) + return self.filter_ans(self.ans) def parse_index(self): if self.headlinesOnly: @@ -833,8 +824,9 @@ class NYTimes(BasicNewsRecipe): for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}): if divr.find(text=re.compile('Sign up')): divr.extract() - divr = soup.find('div',attrs={'id':re.compile('related-content')}) + divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')}) if divr is not None: + print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False)) # handle related articles rlist = [] ul = divr.find('ul') @@ -864,6 +856,8 @@ class NYTimes(BasicNewsRecipe): asidediv.append(Tag(soup,'hr')) smain = soup.find('body') smain.append(asidediv) + else: + print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False)) for atag in soup.findAll('a'): img = atag.find('img') if img is not None: @@ -906,6 +900,18 @@ class NYTimes(BasicNewsRecipe): 
first_outer = outerdiv else: litag.extract() + for h6tag in rdiv.findAll('h6'): + if h6tag.find('a') is not None: + if h6tag.find('a')['href'].startswith('http://www.nytimes.com'): + url = re.sub(r'\?.*', '', h6tag.find('a')['href']) + h6tag.find('a')['href'] = url+'?pagewanted=all' + h6tag.extract() + related.append(h6tag) + if first_related is None: + first_related = rdiv + first_outer = outerdiv + else: + h6tag.extract() if related != []: for r in related: if r.h6: # don't want the anchor inside a h6 tag diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe index 5388da9dcb..2d8fb69a7e 100644 --- a/recipes/nytimesbook.recipe +++ b/recipes/nytimesbook.recipe @@ -1,21 +1,59 @@ + from calibre.web.feeds.news import BasicNewsRecipe class NewYorkTimesBookReview(BasicNewsRecipe): title = u'New York Times Book Review' language = 'en' - __author__ = 'Krittika Goyal' - oldest_article = 8 #days - max_articles_per_feed = 1000 - #recursions = 2 - #encoding = 'latin1' - use_embedded_content = False + description = 'The New York Times Sunday Book Review. Best downloaded on Fridays to avoid the ads that the New York Times shows of the first few days of the week.' 
+ __author__ = 'Kovid Goyal' + no_stylesheets = True - auto_cleanup = True + no_javascript = True + keep_only_tags = [dict(id='article'), dict(id=lambda x:x and x.startswith('entry-'))] + remove_tags = [ + dict(attrs={'class':['articleBottomExtra', 'shareToolsBox', 'singleAd']}), + dict(attrs={'class':lambda x: x and ('shareTools' in x or 'enlargeThis' in x)}), + ] + + def parse_index(self): + soup = self.index_to_soup('http://www.nytimes.com/pages/books/review/index.html') + + # Find TOC + toc = soup.find('div', id='main').find( + 'div', attrs={'class':'abColumn'}) + feeds = [] + articles = [] + section_title = 'Features' + for x in toc.findAll(['div', 'h3', 'h6'], attrs={'class':['story', 'sectionHeader', 'ledeStory']}): + if x['class'] == 'sectionHeader': + if articles: + feeds.append((section_title, articles)) + section_title = self.tag_to_string(x) + articles = [] + self.log('Found section:', section_title) + continue + if x['class'] in {'story', 'ledeStory'}: + tt = 'h3' if x['class'] == 'story' else 'h1' + try: + a = x.find(tt).find('a', href=True) + except AttributeError: + continue + title = self.tag_to_string(a) + url = a['href'] + '&pagewanted=all' + self.log('\tFound article:', title, url) + desc = '' + byline = x.find('h6', attrs={'class':'byline'}) + if byline is not None: + desc = self.tag_to_string(byline) + summary = x.find('p', attrs={'class':'summary'}) + if summary is not None: + desc += self.tag_to_string(summary) + if desc: + self.log('\t\t', desc) + articles.append({'title':title, 'url':url, 'date':'', + 'description':desc}) + + return feeds - feeds = [ - ('New York Times Sunday Book Review', - 'http://feeds.nytimes.com/nyt/rss/SundayBookReview'), - ] - diff --git a/recipes/oclab_pl.recipe b/recipes/oclab_pl.recipe index c00ec0d61c..240d8f91f6 100644 --- a/recipes/oclab_pl.recipe +++ b/recipes/oclab_pl.recipe @@ -1,4 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + class 
OCLab(BasicNewsRecipe): title = u'OCLab.pl' oldest_article = 7 @@ -26,6 +28,10 @@ class OCLab(BasicNewsRecipe): appendtag.insert(pos, pagetext) for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}): r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + def preprocess_html(self, soup): self.append_page(soup, soup.body) return soup diff --git a/recipes/osworld_pl.recipe b/recipes/osworld_pl.recipe new file mode 100644 index 0000000000..7784a271e0 --- /dev/null +++ b/recipes/osworld_pl.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OSWorld(BasicNewsRecipe): + title = u'OSWorld.pl' + __author__ = 'fenuks' + description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' 
+ category = 'OS, IT, open source, Linux' + language = 'pl' + cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id=['dzial', 'posts'])] + remove_tags = [dict(attrs={'class':'post-comments'})] + remove_tags_after = dict(attrs={'class':'entry clr'}) + feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] + + def append_page(self, soup, appendtag): + tag = appendtag.find(attrs={'id':'paginacja'}) + if tag: + for nexturl in tag.findAll('a'): + soup2 = self.index_to_soup(nexturl['href']) + pagetext = soup2.find(attrs={'class':'entry clr'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'id':'paginacja'}): + r.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe index 953dee67eb..fd814a456c 100644 --- a/recipes/overclock_pl.recipe +++ b/recipes/overclock_pl.recipe @@ -1,5 +1,6 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class Overclock_pl(BasicNewsRecipe): title = u'Overclock.pl' oldest_article = 7 @@ -21,4 +22,4 @@ class Overclock_pl(BasicNewsRecipe): if 'articles/show' in url: return url.replace('show', 'showall') else: - return url \ No newline at end of file + return url diff --git a/recipes/pc_foster.recipe b/recipes/pc_foster.recipe index ab8c2b66b1..f9c7eea1e0 100644 --- a/recipes/pc_foster.recipe +++ b/recipes/pc_foster.recipe @@ -1,4 +1,8 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + +#currently recipe is not working + class PC_Foster(BasicNewsRecipe): title = u'PC Foster' oldest_article = 7 @@ -7,12 +11,12 @@ class 
PC_Foster(BasicNewsRecipe): description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.' category = 'IT' language = 'pl' - masthead_url='http://pcfoster.pl/public/images/logo.png' - cover_url= 'http://pcfoster.pl/public/images/logo.png' - no_stylesheets= True - remove_empty_feeds= True - keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] - remove_tags=[dict(name='p', attrs={'class':'right'})] + masthead_url = 'http://pcfoster.pl/public/images/logo.png' + cover_url = 'http://pcfoster.pl/public/images/logo.png' + no_stylesheets = True + remove_empty_feeds = True + keep_only_tags = [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] + remove_tags = [dict(name='p', attrs={'class':'right'})] feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')] @@ -29,7 +33,10 @@ class PC_Foster(BasicNewsRecipe): appendtag.insert(pos, pagetext) for r in appendtag.findAll(attrs={'class':'review_content double'}): r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() def preprocess_html(self, soup): self.append_page(soup, soup.body) - return soup \ No newline at end of file + return soup diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index c4b33b8416..7a6038bd65 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,5 +1,4 @@ #!/usr/bin/env python - from calibre.web.feeds.recipes import BasicNewsRecipe class PCLab(BasicNewsRecipe): @@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe): __author__ = 'ravcio - rlelusz[at]gmail.com' description = u"Articles from PC Lab website" language = 'pl' - oldest_article = 30.0 + oldest_article = 30 max_articles_per_feed = 100 recursions = 0 encoding = 'iso-8859-2' no_stylesheets = True remove_javascript = True + remove_empty_feeds = True 
use_embedded_content = False keep_only_tags = [ @@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['chapters']}) - ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']}) + dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']}) ] - remove_tags_after = [ - dict(name='div', attrs={'class':['navigation']}) - ] - #links to RSS feeds - feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ] + feeds = [ + (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'), + (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'), + (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml') + ] #load second and subsequent page content # in: soup - full page with 'next' button # out: appendtag - tag to which new page is to be added def append_page(self, soup, appendtag): # find the 'Next' button - pager = soup.find('div', attrs={'class':'next'}) - + pager = soup.find('div', attrs={'class':'navigation'}) if pager: + a = pager.find('a') + if 'news' in a['href']: + pager = None + else: + pager = pager.find('div', attrs={'class':'next'}) + + while pager: #search for 'a' element with link to next page (exit if not found) a = pager.find('a') - if a: - nexturl = a['href'] + nexturl = a['href'] + soup2 = self.index_to_soup('http://pclab.pl' + nexturl) + pager = soup2.find('div', attrs={'class':'next'}) + pagetext = soup2.find('div', attrs={'class':'substance'}) + pagetext = pagetext.find('div', attrs={'class':'data'}) - soup2 = self.index_to_soup('http://pclab.pl/' + nexturl) - - pagetext_substance = soup2.find('div', attrs={'class':'substance'}) - pagetext = pagetext_substance.find('div', attrs={'class':'data'}) - pagetext.extract() - - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - pos = len(appendtag.contents) - - self.append_page(soup2, appendtag) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + 
pager = soup.find('div', attrs={'class':'navigation'}) + if pager: + pager.extract() def preprocess_html(self, soup): - # soup.body contains no title and no navigator, they are in soup self.append_page(soup, soup.body) - + for link in soup.findAll('a'): + href = link.get('href', None) + if href and href.startswith('/'): + link['href'] = 'http://pclab.pl' + href # finally remove some tags - tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) - [tag.extract() for tag in tags] + #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) return soup diff --git a/recipes/pnn.recipe b/recipes/pnn.recipe new file mode 100644 index 0000000000..cb36afe88b --- /dev/null +++ b/recipes/pnn.recipe @@ -0,0 +1,55 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the PNN to an ebook.''' + +class SportsIllustratedRecipe(BasicNewsRecipe) : + __author__ = 'n.kucklaender' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + language = 'de' + description = 'PNN RSS' + version = 1 + title = u'PNN' + timefmt = ' [%d.%m.%Y]' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + publication_type = 'newspaper' + + remove_empty_feeds = True + remove_tags = [dict(attrs={'class':['um-weather um-header-weather','um-has-sub um-mainnav','um-box','ts-products','um-meta-nav','um-box um-last','um-footer','um-footer-links','share hidden','um-buttons']}),dict(id=['dinsContainer'])] + # remove_tags_before = [dict(name='div', attrs={'class':'um-first'})] + # remove_tags_after = [dict(name='div', attrs={'class':'um-metabar'})] + + feeds = [(u'Titelseite', u'http://www.pnn.de/rss.xml'), + (u'Dritte Seite', u'http://www.pnn.de/dritte-seite/rss.xml'), + (u'Politik', u'http://www.pnn.de/politik/rss.xml'), + (u'Meinung', 
u'http://www.pnn.de/meinung/rss.xml'), + (u'Potsdam', u'http://www.pnn.de/potsdam/rss.xml'), + (u'Havel-Spree', u'http://www.pnn.de/havel-spree/rss.xml'), + (u'Potsdam-Mittelmark', u'http://www.pnn.de/pm/rss.xml'), + (u'Berlin-Brandenburg', u'http://www.pnn.de/brandenburg-berlin/rss.xml'), + (u'Wirtschaft', u'http://www.pnn.de/wirtschaft/rss.xml'), + (u'Sport', u'http://www.pnn.de/sport/rss.xml'), + (u'Regionalsport', u'http://www.pnn.de/regionalsport/rss.xml'), + (u'Kultur', u'http://www.pnn.de/kultur/rss.xml'), + (u'Potsdam-Kultur', u'http://www.pnn.de/potsdam-kultur/rss.xml'), + (u'Wissen', u'http://www.pnn.de/wissen/rss.xml'), + (u'Medien', u'http://www.pnn.de/medien/rss.xml'), + (u'Weltspiegel', u'http://www.pnn.de/weltspiegel/rss.xml'), + (u'Wissenschaft', u'http://www.pnn.de/campus/rss.xml'), + (u'Mobil', u'http://www.pnn.de/mobil/rss.xml'), + (u'Reise', u'http://www.pnn.de/reise/rss.xml'), + (u'Ratgeber', u'http://www.pnn.de/ratgeber/rss.xml'), + (u'Fragen des Tages', u'http://www.pnn.de/fragen-des-tages/rss.xml'), + # (u'Potsdam bin ich', u'http://www.pnn.de/potsdam-bin-ich/rss.xml'), + (u'Leserbriefe', u'http://www.pnn.de/leserbriefe/rss.xml')] + + def get_masthead_url(self): + return 'http://www.pnn.de/app/base/img/pnn_logo.png' + + def print_version(self, url): + return url.replace('.html', ',view,printVersion.html') + diff --git a/recipes/polska_times.recipe b/recipes/polska_times.recipe index 21104f1299..6fd6734c8c 100644 --- a/recipes/polska_times.recipe +++ b/recipes/polska_times.recipe @@ -7,9 +7,11 @@ class PolskaTimes(BasicNewsRecipe): language = 'pl' masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17' oldest_article = 7 + encoding = 'iso-8859-2' max_articles_per_feed = 100 - remove_emty_feeds= True + remove_empty_feeds = True no_stylesheets = True + use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} #preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), 
(re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ] remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) diff --git a/recipes/pravda_rs.recipe b/recipes/pravda_rs.recipe new file mode 100644 index 0000000000..742527ac2b --- /dev/null +++ b/recipes/pravda_rs.recipe @@ -0,0 +1,85 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>' + +''' +www.pravda.rs +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Pravda_rs(BasicNewsRecipe): + title = 'Dnevne novine Pravda' + __author__ = 'Darko Miletic' + description = '24 sata portal vesti iz Srbije' + publisher = 'Dnevne novine Pravda' + category = 'news, politics, entertainment, Serbia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'sr' + publication_type = 'newspaper' + remove_empty_feeds = True + PREFIX = 'http://www.pravda.rs' + FEEDPR = PREFIX + '/category/' + LANGLAT = '?lng=lat' + FEEDSU = '/feed/' + LANGLAT + INDEX = PREFIX + LANGLAT + masthead_url = 'http://www.pravda.rs/wp-content/uploads/2012/09/logoof.png' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + body{font-family: Georgia,"Times New Roman",Times,serif1,serif;} + img{display: block} + """ + + conversion_options = { + 'comment' 
: description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + keep_only_tags = [dict(name='div', attrs={'class':'post'})] + remove_tags = [dict(name='h3')] + remove_tags_after = dict(name='h3') + + feeds = [ + (u'Politika' , FEEDPR + 'politika/' + FEEDSU), + (u'Tema Dana', FEEDPR + 'tema-dana/' + FEEDSU), + (u'Hronika' , FEEDPR + 'hronika/' + FEEDSU), + (u'Društvo' , FEEDPR + 'drustvo/' + FEEDSU), + (u'Ekonomija', FEEDPR + 'ekonomija/' + FEEDSU), + (u'Srbija' , FEEDPR + 'srbija/' + FEEDSU), + (u'Beograd' , FEEDPR + 'beograd/' + FEEDSU), + (u'Kultura' , FEEDPR + 'kultura/' + FEEDSU), + (u'Zabava' , FEEDPR + 'zabava/' + FEEDSU), + (u'Sport' , FEEDPR + 'sport/' + FEEDSU), + (u'Svet' , FEEDPR + 'svet/' + FEEDSU), + (u'Porodica' , FEEDPR + 'porodica/' + FEEDSU), + (u'Vremeplov', FEEDPR + 'vremeplov/' + FEEDSU), + (u'IT' , FEEDPR + 'it/' + FEEDSU), + (u'Republika Srpska', FEEDPR + 'republika-srpska/' + FEEDSU), + (u'Crna Gora', FEEDPR + 'crna-gora/' + FEEDSU), + (u'EX YU' , FEEDPR + 'eks-ju/' + FEEDSU), + (u'Dijaspora', FEEDPR + 'dijaspora/' + FEEDSU), + (u'Kolumna' , FEEDPR + 'kolumna/' + FEEDSU), + (u'Afere' , FEEDPR + 'afere/' + FEEDSU), + (u'Feljton' , FEEDPR + 'feljton/' + FEEDSU), + (u'Intervju' , FEEDPR + 'intervju/' + FEEDSU), + (u'Reportaža', FEEDPR + 'reportaza/' + FEEDSU), + (u'Zanimljivosti', FEEDPR + 'zanimljivosti/' + FEEDSU), + (u'Sa trga' , FEEDPR + 'sa-trga/' + FEEDSU) + ] + + def print_version(self, url): + return url + self.LANGLAT + + def preprocess_raw_html(self, raw, url): + return '<html><head><title>title'+raw[raw.find(''):] + \ No newline at end of file diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe index 7a6c43bb7e..13d9307a09 100644 --- a/recipes/pure_pc.recipe +++ b/recipes/pure_pc.recipe @@ -1,4 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment + class 
PurePC(BasicNewsRecipe): title = u'PurePC' oldest_article = 7 @@ -27,7 +29,10 @@ class PurePC(BasicNewsRecipe): appendtag.insert(pos, pagetext) for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}): r.extract() + comments = appendtag.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() def preprocess_html(self, soup): self.append_page(soup, soup.body) - return soup \ No newline at end of file + return soup diff --git a/recipes/revista_cromos.recipe b/recipes/revista_cromos.recipe new file mode 100644 index 0000000000..29515971dd --- /dev/null +++ b/recipes/revista_cromos.recipe @@ -0,0 +1,33 @@ +# coding=utf-8 +# https://github.com/iemejia/calibrecolombia + +''' +http://www.cromos.com.co/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMalpensante(BasicNewsRecipe): + title = u'Revista Cromos' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://www.cromos.com.co/sites/cromos.com.co/themes/cromos_theme/images/logo_morado.gif' + description = 'Revista Cromos' + oldest_article = 7 + simultaneous_downloads = 20 + #tags = 'news, sport, blog' + use_embedded_content = True + remove_empty_feeds = True + max_articles_per_feed = 100 + feeds = [(u'Cromos', u'http://www.cromos.com.co/rss.xml'), + (u'Moda', u'http://www.cromos.com.co/moda/feed'), + (u'Estilo de Vida', u'http://www.cromos.com.co/estilo-de-vida/feed'), + (u'Cuidado Personal', u'http://www.cromos.com.co/estilo-de-vida/cuidado-personal/feed'), + (u'Salud y Alimentación', u'http://www.cromos.com.co/estilo-de-vida/salud-y-alimentacion/feed'), + (u'Personajes', u'http://www.cromos.com.co/personajes/feed'), + (u'Actualidad', u'http://www.cromos.com.co/personajes/actualidad/feed'), + (u'Espectáculo', u'http://www.cromos.com.co/personajes/espectaculo/feed'), + (u'Reportajes', u'http://www.cromos.com.co/reportajes/feed'), + (u'Eventos', u'http://www.cromos.com.co/eventos/feed'), + (u'Modelos', 
u'http://www.cromos.com.co/modelos/feed'), + ] diff --git a/recipes/rmf24_opinie.recipe b/recipes/rmf24_opinie.recipe index 9e4d336252..0bbe5d03a4 100644 --- a/recipes/rmf24_opinie.recipe +++ b/recipes/rmf24_opinie.recipe @@ -46,7 +46,7 @@ class RMF24_opinie(BasicNewsRecipe): return link preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'

Zdj.cie

', lambda match: ''), (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), diff --git a/recipes/rynek_kolejowy.recipe b/recipes/rynek_kolejowy.recipe index f68b33f84b..181bf5b6d3 100644 --- a/recipes/rynek_kolejowy.recipe +++ b/recipes/rynek_kolejowy.recipe @@ -37,4 +37,3 @@ class rynek_kolejowy(BasicNewsRecipe): segment = url.split('/') urlPart = segment[3] return 'http://www.rynek-kolejowy.pl/drukuj.php?id=' + urlPart - diff --git a/recipes/rzeczpospolita.recipe b/recipes/rzeczpospolita.recipe index 40cb4db3ac..d1453ab57e 100644 --- a/recipes/rzeczpospolita.recipe +++ b/recipes/rzeczpospolita.recipe @@ -70,5 +70,3 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): forget, sep, index = rest.rpartition(',') return start + '/' + index + '?print=tak' - - diff --git a/recipes/samcik_blox.recipe b/recipes/samcik_blox.recipe index 69bb836c76..0db6b7e8af 100644 --- a/recipes/samcik_blox.recipe +++ b/recipes/samcik_blox.recipe @@ -22,5 +22,5 @@ class samcik(BasicNewsRecipe): remove_tags =[] remove_tags.append(dict(name = 'table', attrs = {'border' : '0'})) - + feeds = [(u'Wpisy', u'http://samcik.blox.pl/rss2')] diff --git a/recipes/satkurier.recipe b/recipes/satkurier.recipe index 382f7f8180..6c7520f47b 100644 --- a/recipes/satkurier.recipe +++ b/recipes/satkurier.recipe @@ -8,10 +8,7 @@ class SATKurier(BasicNewsRecipe): title = u'SATKurier.pl' __author__ = 'Artur Stachecki ' language = 'pl' - description = u'Największy i najstarszy serwis poświęcony\ - telewizji cyfrowej, przygotowywany przez wydawcę\ - miesięcznika SAT Kurier. Bieżące wydarzenia\ - z rynku mediów i nowych technologii.' 
+ description = u'Serwis poświęcony telewizji cyfrowej' oldest_article = 7 masthead_url = 'http://satkurier.pl/img/header_sk_logo.gif' max_articles_per_feed = 100 diff --git a/recipes/science_news.recipe b/recipes/science_news.recipe index fa24bbadcf..53b451030a 100644 --- a/recipes/science_news.recipe +++ b/recipes/science_news.recipe @@ -1,24 +1,38 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' ''' sciencenews.org ''' from calibre.web.feeds.news import BasicNewsRecipe -class Sciencenews(BasicNewsRecipe): - title = u'ScienceNews' - __author__ = u'Darko Miletic and Sujata Raman' - description = u"Science News is an award-winning weekly newsmagazine covering the most important research in all fields of science. Its 16 pages each week are packed with short, accurate articles that appeal to both general readers and scientists. Published since 1922, the magazine now reaches about 150,000 subscribers and more than 1 million readers. These are the latest News Items from Science News." +class ScienceNewsIssue(BasicNewsRecipe): + title = u'Science News Recent Issues' + __author__ = u'Darko Miletic, Sujata Raman and Starson17' + description = u'''Science News is an award-winning weekly + newsmagazine covering the most important research in all fields of science. + Its 16 pages each week are packed with short, accurate articles that appeal + to both general readers and scientists. Published since 1922, the magazine + now reaches about 150,000 subscribers and more than 1 million readers. + These are the latest News Items from Science News. 
This recipe downloads + the last 30 days worth of articles.''' + category = u'Science, Technology, News' + publisher = u'Society for Science & the Public' oldest_article = 30 language = 'en' - max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - auto_cleanup = True timefmt = ' [%A, %d %B, %Y]' + recursions = 1 + remove_attributes = ['style'] + + conversion_options = {'linearize_tables' : True + , 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } extra_css = ''' .content_description{font-family:georgia ;font-size:x-large; color:#646464 ; font-weight:bold;} @@ -27,36 +41,33 @@ class Sciencenews(BasicNewsRecipe): .content_edition{font-family:helvetica,arial ;font-size: xx-small ;} .exclusive{color:#FF0000 ;} .anonymous{color:#14487E ;} - .content_content{font-family:helvetica,arial ;font-size: x-small ; color:#000000;} - .description{color:#585858;font-family:helvetica,arial ;font-size: xx-small ;} + .content_content{font-family:helvetica,arial ;font-size: medium ; color:#000000;} + .description{color:#585858;font-family:helvetica,arial ;font-size: large ;} .credit{color:#A6A6A6;font-family:helvetica,arial ;font-size: xx-small ;} ''' - #keep_only_tags = [ dict(name='div', attrs={'id':'column_action'}) ] - #remove_tags_after = dict(name='ul', attrs={'id':'content_functions_bottom'}) - #remove_tags = [ - #dict(name='ul', attrs={'id':'content_functions_bottom'}) - #,dict(name='div', attrs={'id':['content_functions_top','breadcrumb_content']}) - #,dict(name='img', attrs={'class':'icon'}) - #,dict(name='div', attrs={'class': 'embiggen'}) - #] + keep_only_tags = [ dict(name='div', attrs={'class':'content_content'}), + dict(name='ul', attrs={'id':'toc'}) + ] - feeds = [(u"Science News / News Items", u'http://sciencenews.org/index.php/feed/type/news/name/news.rss/view/feed/name/all.rss')] + feeds = [(u"Science News Current Issues", 
u'http://www.sciencenews.org/view/feed/type/edition/name/issues.rss')] + + match_regexps = [ + r'www.sciencenews.org/view/feature/id/', + r'www.sciencenews.org/view/generic/id' + ] def get_cover_url(self): cover_url = None index = 'http://www.sciencenews.org/view/home' soup = self.index_to_soup(index) link_item = soup.find(name = 'img',alt = "issue") - print link_item if link_item: cover_url = 'http://www.sciencenews.org' + link_item['src'] + '.jpg' return cover_url - #def preprocess_html(self, soup): - - #for tag in soup.findAll(name=['span']): - #tag.name = 'div' - - #return soup + def preprocess_html(self, soup): + for tag in soup.findAll(name=['span']): + tag.name = 'div' + return soup diff --git a/recipes/smith.recipe b/recipes/smith.recipe index cd0c94ab35..db45e42fc7 100644 --- a/recipes/smith.recipe +++ b/recipes/smith.recipe @@ -25,7 +25,7 @@ class Smithsonian(BasicNewsRecipe): soup = self.index_to_soup(current_issue_url) #Go to the main body - div = soup.find ('div', attrs={'id':'content-inset'}) + div = soup.find ('div', attrs={'id':'article-body'}) #Find date date = re.sub('.*\:\W*', "", self.tag_to_string(div.find('h2')).strip()) @@ -36,35 +36,29 @@ class Smithsonian(BasicNewsRecipe): feeds = OrderedDict() section_title = '' - subsection_title = '' + articles = [] for post in div.findAll('div', attrs={'class':['plainModule', 'departments plainModule']}): - articles = [] - prefix = '' - h3=post.find('h3') - if h3 is not None: - section_title = self.tag_to_string(h3) + h4=post.find('h3') + if h4 is not None: + if articles: + if section_title not in feeds: + feeds[section_title] = [] + feeds[section_title] += articles + section_title = self.tag_to_string(h4) + articles = [] + self.log('Found section:', section_title) else: - subsection=post.find('p',attrs={'class':'article-cat'}) link=post.find('a',href=True) url=link['href']+'?c=y&story=fullstory' - if subsection is not None: - subsection_title = self.tag_to_string(subsection).strip() - prefix = 
(subsection_title+': ') - description=self.tag_to_string(post('p', limit=2)[1]).strip() - else: - if post.find('img') is not None: - subsection_title = self.tag_to_string(post.findPrevious('div', attrs={'class':'departments plainModule'}).find('p', attrs={'class':'article-cat'})).strip() - prefix = (subsection_title+': ') - - description=self.tag_to_string(post.find('p')).strip() + description=self.tag_to_string(post.find('p')).strip() desc=re.sub('\sBy\s.*', '', description, re.DOTALL) author=re.sub('.*By\s', '', description, re.DOTALL) - title=prefix + self.tag_to_string(link).strip()+ u' (%s)'%author + title=self.tag_to_string(link).strip()+ u' (%s)'%author + self.log('\tFound article:', title) articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) - if articles: - if section_title not in feeds: - feeds[section_title] = [] - feeds[section_title] += articles + if articles: + feeds[section_title] = articles + ans = [(key, val) for key, val in feeds.iteritems()] return ans diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index 678ee5c640..e2f9e6834d 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -4,12 +4,15 @@ class SpidersWeb(BasicNewsRecipe): title = u"Spider's Web" oldest_article = 7 __author__ = 'fenuks' - description = u'' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + description = u'Autorskie teksty popularnych blogerów, testy sprzętu i aplikacji, oraz wiele więcej.' 
+ cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' category = 'IT, WEB' language = 'pl' no_stylesheers=True + remove_javascript = True + use_embedded_content = False max_articles_per_feed = 100 - keep_only_tags=[dict(id='Post')] - remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')] + keep_only_tags=[dict(id='start')] + remove_tags_after = dict(attrs={'class':'padding20'}) + remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff --git a/recipes/swiat_obrazu.recipe b/recipes/swiat_obrazu.recipe index 91748fb139..f088ff5e80 100644 --- a/recipes/swiat_obrazu.recipe +++ b/recipes/swiat_obrazu.recipe @@ -1,7 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Swiat_Obrazu(BasicNewsRecipe): - title = u'Swiat Obrazu' + title = u'Świat Obrazu' __author__ = 'fenuks' description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.' 
category = 'photography' diff --git a/recipes/swiatkindle.recipe b/recipes/swiatkindle.recipe index 9847d1359e..c589d1b6e1 100644 --- a/recipes/swiatkindle.recipe +++ b/recipes/swiatkindle.recipe @@ -19,7 +19,7 @@ class swiatczytnikow(BasicNewsRecipe): feeds = [(u'Świat Czytników - wpisy', u'http://swiatczytnikow.pl/feed')] - remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'})] + remove_tags = [dict(name = 'ul', attrs = {'class' : 'similar-posts'}), + dict(name = 'div', attrs = {'class' : 'feedflare'})] preprocess_regexps = [(re.compile(u'

Czytaj dalej:

'), lambda match: '')] - diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index 1c3f46f967..97a44f81c7 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -3,7 +3,7 @@ import re class Tablety_pl(BasicNewsRecipe): title = u'Tablety.pl' __author__ = 'fenuks' - description = u'tablety.pl - latest tablet news' + description = u'Tablety, gry i aplikacje na tablety.' masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' category = 'IT' diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe index a615763307..6f37c17e7c 100644 --- a/recipes/tanuki.recipe +++ b/recipes/tanuki.recipe @@ -4,6 +4,7 @@ class tanuki(BasicNewsRecipe): title = u'Tanuki' oldest_article = 7 __author__ = 'fenuks' + description = u'Tanuki - portal o anime i mandze.' category = 'anime, manga' language = 'pl' max_articles_per_feed = 100 @@ -42,4 +43,4 @@ class tanuki(BasicNewsRecipe): a['href']='http://manga.tanuki.pl' + a['href'] elif 'tanuki-czytelnia' in soup.title.string.lower(): a['href']='http://czytelnia.tanuki.pl' + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/telepolis_pl.recipe b/recipes/telepolis_pl.recipe index ff4803697f..1aa7734c2c 100644 --- a/recipes/telepolis_pl.recipe +++ b/recipes/telepolis_pl.recipe @@ -3,65 +3,44 @@ __license__ = 'GPL v3' from calibre.web.feeds.news import BasicNewsRecipe -import re class telepolis(BasicNewsRecipe): title = u'Telepolis.pl' - __author__ = 'Artur Stachecki ' + __author__ = 'Artur Stachecki , Tomasz Długosz ' + language = 'pl' - description = u'Twój telekomunikacyjny serwis informacyjny.\ - Codzienne informacje, testy i artykuły,\ - promocje, baza telefonów oraz centrum rozrywki' - oldest_article = 7 + description = u'Twój telekomunikacyjny serwis informacyjny.' 
masthead_url = 'http://telepolis.pl/i/telepolis-logo2.gif' - max_articles_per_feed = 100 - simultaneous_downloads = 5 - remove_javascript = True no_stylesheets = True use_embedded_content = False - remove_tags = [] - remove_tags.append(dict(attrs={'alt': 'TELEPOLIS.pl'})) - - preprocess_regexps = [(re.compile(r'<: .*? :>'), - lambda match: ''), - (re.compile(r'Zobacz:.*?
', re.DOTALL), - lambda match: ''), - (re.compile(r'<-ankieta.*?>'), - lambda match: ''), - (re.compile(r'\(Q\!\)'), - lambda match: ''), - (re.compile(r'\(plik.*?\)'), - lambda match: ''), - (re.compile(r'', re.DOTALL), - lambda match: '') - ] - - extra_css = '''.tb { font-weight: bold; font-size: 20px;}''' - feeds = [ - (u'Wiadomości', u'http://www.telepolis.pl/rss/news.php'), - (u'Artykuły', u'http://www.telepolis.pl/rss/artykuly.php') + (u'Wiadomości', u'http://www.telepolis.pl/rss,2,5,0.html') ] - def print_version(self, url): - if 'news.php' in url: - print_url = url.replace('news.php', 'news_print.php') - else: - print_url = url.replace('artykuly.php', 'art_print.php') - return print_url + keep_only_tags = [ + dict(name='div', attrs={'class':'flol w510'}), + dict(name='div', attrs={'class':'main_tresc'}), + dict(name='div', attrs={'class':'main_tresc_news'}) + ] + + def append_page(self, soup, appendtag): + chpage= appendtag.find(attrs={'class':'str'}) + if chpage: + for page in chpage.findAll('a'): + if page.renderContents() == 'Następna ›': + break + soup2 = self.index_to_soup(page['href']) + pagetext = soup2.find(attrs={'class':'main_tresc'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'str'}): + r.extract() def preprocess_html(self, soup): + self.append_page(soup, soup.body) for image in soup.findAll('img'): if 'm.jpg' in image['src']: - image_big = image['src'] - image_big = image_big.replace('m.jpg', '.jpg') - image['src'] = image_big - logo = soup.find('tr') - logo.extract() - for tag in soup.findAll('tr'): - for strings in ['Wiadomość wydrukowana', 'copyright']: - if strings in self.tag_to_string(tag): - tag.extract() - return self.adeify_images(soup) + image['src'] = image['src'].replace('m.jpg', '.jpg') + return soup diff --git a/recipes/the_friday_times.recipe b/recipes/the_friday_times.recipe new file mode 100644 index 0000000000..2ca8443684 --- /dev/null +++ 
b/recipes/the_friday_times.recipe @@ -0,0 +1,26 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class TheFridayTimes(BasicNewsRecipe): + title = u'The Friday Times' + language = 'en_PK' + __author__ = 'Krittika Goyal' + + + no_stylesheets = True + no_javascript = True + auto_cleanup = True + + + def parse_index(self): + toc = self.index_to_soup('http://www.thefridaytimes.com/beta3/tft/index.php') + articles = [] + for story in toc.findAll('a', attrs={'class':'homemainlinks'}): + title = self.tag_to_string(story) + url = 'http://www.thefridaytimes.com/beta3/tft/' + story['href'] + self.log('Found article:', story) + self.log('\t', url) + articles.append({'title':title, 'url':url, 'date':'', + 'description':''}) + + return [('Current Issue', articles)] + diff --git a/recipes/trojmiasto_pl.recipe b/recipes/trojmiasto_pl.recipe new file mode 100644 index 0000000000..8ac35c2edb --- /dev/null +++ b/recipes/trojmiasto_pl.recipe @@ -0,0 +1,37 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class Trojmiasto(BasicNewsRecipe): + title = u'Tr\xf3jmiasto.pl' + __author__ = 'fenuks' + description = u'Wiadomości, imprezy, wydarzenia, spektakle.Gdańsk, Gdynia, Sopot - NOCLEGI, Katalog firm, repertuar kin, wydarzenia, przewodnik, mapa, kwatery, hotele. 
Portal regionalny trojmiasto.pl' + category = '' + #publication_type = '' + language = 'pl' + encoding = 'iso-8859-2' + extra_css = 'ul {list-style: none; padding:0; margin:0;}' + cover_url = 'http://www.trojmiasto.pl/_img/toplong2/logo_trojmiasto.gif' + #masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + preprocess_regexps = [(re.compile(ur'Czytaj więcej.*?', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też.*?', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(ur'[A-ZĄĆĘŁŃÓŚŹŻ \-,.:]*?', re.DOTALL), lambda match: ''),] + + #keep_only_tags = [] + remove_tags = [dict(id=['logo', 'font_small', 'font_big']), dict(attrs={'class':['title-long', 'ankieta', 'newsletter-inside-content newsletter-wrap', 'copyright_box', + 'logo', 'btn btn-photo-add', 'related-info-wrap', 'nTabs', 'article-list', 'rate-player horizontal', 'type-box', 'rate-player' + 'hover-nav', 'live-head tC', 'prev-link', 'next-link', 'ie6']}), dict(attrs={'title':[u'drukuj artykuł', u'podziel się na Facebooku', u'prześlij artykuł']})] + remove_tags_after = dict(attrs={'class':'author-wrap'}) + remove_tags_before = dict(attrs={'class':'text-container'}) + + feeds = [(u'Wszystkie', u'http://rss.trojmiasto.pl/rss,0.xml'), (u'Fakty i opinie', u'http://rss.trojmiasto.pl/rss,1.xml'), (u'Sport', u'http://rss.trojmiasto.pl/rss,2.xml'), (u'Dom', u'http://rss.trojmiasto.pl/rss,3.xml'), (u'Moto', u'http://rss.trojmiasto.pl/rss,4.xml'), (u'Nauka', u'http://rss.trojmiasto.pl/rss,5.xml'), (u'Rozrywka', u'http://rss.trojmiasto.pl/rss,6.xml'), (u'Kultura', u'http://rss.trojmiasto.pl/rss,7.xml'), (u'Rowery', u'http://rss.trojmiasto.pl/rss,8.xml'), (u'Dziecko', u'http://rss.trojmiasto.pl/rss,9.xml'), (u'Zdrowie i uroda', u'http://rss.trojmiasto.pl/rss,10.xml'), (u'Praca', 
u'http://rss.trojmiasto.pl/rss,11.xml'), (u'Artyku\u0142y czytelnik\xf3w', u'http://rss.trojmiasto.pl/rss,12.xml'), (u'Korki', u'http://rss.trojmiasto.pl/rss,13.xml'), (u'Historia', u'http://rss.trojmiasto.pl/rss,14.xml'), (u'Biznes', u'http://rss.trojmiasto.pl/rss,16.xml'), (u'Kryminalne Tr\xf3jmiasto', u'http://rss.trojmiasto.pl/rss,17.xml'), (u'Przewodnik', u'http://rss.trojmiasto.pl/rss,18.xml'), (u'Aktywne Tr\xf3jmiasto', u'http://rss.trojmiasto.pl/rss,19.xml'), (u'Delux', u'http://rss.trojmiasto.pl/rss,20.xml')] + + def print_version(self, url): + return url + '?print=1' diff --git a/recipes/tvn24.recipe b/recipes/tvn24.recipe index a5f5111770..bc48f91556 100644 --- a/recipes/tvn24.recipe +++ b/recipes/tvn24.recipe @@ -8,33 +8,33 @@ class tvn24(BasicNewsRecipe): description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata' category = 'news' language = 'pl' - masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' - cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' + #masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' + cover_url= 'http://www.qzdrowiu.pl/Upload/KnowQZdrowiu_PressOffice/TVN24_logo_575702b7-edce-4b6f-a41b-4395f9456f96_ff6d6ccf-528a-4b94-9e61-2fed727aba35.png' extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' remove_empty_feeds = True remove_javascript = True no_stylesheets = True keep_only_tags=[ -# dict(name='h1', attrs={'class':'size38 mt20 pb20'}), - dict(name='div', attrs={'class':'mainContainer'}), -# dict(name='p'), -# dict(attrs={'class':['size18 mt10 mb15', 'bold topicSize1', 'fromUsers content', 'textArticleDefault']}) +# dict(name='h1', attrs={'class':'size38 mt20 pb20'}), + dict(name='div', attrs={'class':'mainContainer'}), +# dict(name='p'), +# dict(attrs={'class':['size18 mt10 mb15', 'bold topicSize1', 'fromUsers content', 'textArticleDefault']}) ] remove_tags=[ - dict(attrs={'class':['commentsInfo', 'textSize', 'related 
newsNews align-right', 'box', 'watchMaterial text', 'related galleryGallery align-center', 'advert block-alignment-right', 'userActions', 'socialBookmarks', 'im yourArticle fl', 'dynamicButton addComment fl', 'innerArticleModule onRight cols externalContent', 'thumbsGallery', 'relatedObject customBlockquote align-right', 'lead', 'mainRightColumn', 'articleDateContainer borderGreyBottom', 'socialMediaContainer onRight loaded', 'quizContent', 'twitter', 'facebook', 'googlePlus', 'share', 'voteResult', 'reportTitleBar bgBlue_v4 mb15', 'innerVideoModule center']}), - dict(name='article', attrs={'class':['singleArtPhotoCenter', 'singleArtPhotoRight', 'singleArtPhotoLeft']}), - dict(name='section', attrs={'id':['forum', 'innerArticle', 'quiz toCenter', 'mb20']}), - dict(name='div', attrs={'class':'socialMediaContainer big p20 mb20 borderGrey loaded'}) - ] + dict(attrs={'class':['commentsInfo', 'textSize', 'related newsNews align-right', 'box', 'watchMaterial text', 'related galleryGallery align-center', 'advert block-alignment-right', 'userActions', 'socialBookmarks', 'im yourArticle fl', 'dynamicButton addComment fl', 'innerArticleModule onRight cols externalContent', 'thumbsGallery', 'relatedObject customBlockquote align-right', 'lead', 'mainRightColumn', 'articleDateContainer borderGreyBottom', 'socialMediaContainer onRight loaded', 'quizContent', 'twitter', 'facebook', 'googlePlus', 'share', 'voteResult', 'reportTitleBar bgBlue_v4 mb15', 'innerVideoModule center']}), + dict(name='article', attrs={'class':['singleArtPhotoCenter', 'singleArtPhotoRight', 'singleArtPhotoLeft']}), + dict(name='section', attrs={'id':['forum', 'innerArticle', 'quiz toCenter', 'mb20']}), + dict(name='div', attrs={'class':'socialMediaContainer big p20 mb20 borderGrey loaded'}) + ] remove_tags_after=[dict(name='li', attrs={'class':'share'})] feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), ] - #(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', 
u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')] + #(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')] def preprocess_html(self, soup): for alink in soup.findAll('a'): if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) + tstr = alink.string + alink.replaceWith(tstr) return soup def postprocess_html(self, soup, first): diff --git a/recipes/ubuntu_pl.recipe b/recipes/ubuntu_pl.recipe index 84912e44fa..4d2340ad84 100644 --- a/recipes/ubuntu_pl.recipe +++ b/recipes/ubuntu_pl.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Ubuntu_pl(BasicNewsRecipe): title = u'UBUNTU.pl' __author__ = 'fenuks' - description = 'UBUNTU.pl - polish ubuntu community site' + description = 'Polskie forum użytkowników Ubuntu Linux. Projekty, porady i dyskusje, gotowe rozwiązania problemów.' masthead_url= 'http://ubuntu.pl/img/logo.jpg' cover_url = 'http://ubuntu.pl/img/logo.jpg' category = 'linux, IT' diff --git a/recipes/ubuntu_pomoc_org.recipe b/recipes/ubuntu_pomoc_org.recipe new file mode 100644 index 0000000000..71dba95af8 --- /dev/null +++ b/recipes/ubuntu_pomoc_org.recipe @@ -0,0 +1,23 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class UbuntuPomoc(BasicNewsRecipe): + title = u'Ubuntu-pomoc.org' + __author__ = 'fenuks' + description = u'Strona poświęcona systemowi Ubuntu Linux. Znajdziesz tutaj przydatne i sprawdzone poradniki oraz sposoby rozwiązywania wielu popularnych problemów. 
Ten blog rozwiąże każdy Twój problem - jeśli nie teraz, to wkrótce! :)' + category = 'Linux, Ubuntu, open source' + language = 'pl' + cover_url = 'http://www.ubuntu-pomoc.org/grafika/ubuntupomoc.png' + preprocess_regexps = [(re.compile(r'
.+', re.IGNORECASE|re.DOTALL), lambda m: '')] + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + use_embedded_content = False + remove_attrs = ['style'] + keep_only_tags = [dict(name='article')] + #remove_tags_after = dict(attrs={'class':'underEntry'}) + remove_tags = [dict(attrs={'class':['yarpp-related', 'youtube_sc', 'share']}), dict(name='footer')] + feeds = [(u'Ca\u0142o\u015b\u0107', u'http://feeds.feedburner.com/Ubuntu-Pomoc'), + ] diff --git a/recipes/unperiodico.recipe b/recipes/unperiodico.recipe new file mode 100644 index 0000000000..b40b6c372e --- /dev/null +++ b/recipes/unperiodico.recipe @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# https://github.com/iemejia/calibrecolombia + +''' +http://www.unperiodico.unal.edu.co/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class UNPeriodico(BasicNewsRecipe): + title = u'UN Periodico' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://www.unperiodico.unal.edu.co/fileadmin/templates/periodico/img/logoperiodico.png' + description = 'UN Periodico' + oldest_article = 30 + max_articles_per_feed = 100 + publication_type = 'newspaper' + feeds = [ + (u'UNPeriodico', u'http://www.unperiodico.unal.edu.co/rss/type/rss2/') + ] diff --git a/recipes/volksrant.recipe b/recipes/volksrant.recipe index e5499fed73..fa143c97ad 100644 --- a/recipes/volksrant.recipe +++ b/recipes/volksrant.recipe @@ -41,17 +41,9 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe): ####################################################################################################### temp_files = [] articles_are_obfuscated = True - use_javascript_to_login = True - - def javascript_login(self, br, username, password): - 'Volksrant wants the user to explicitly allow cookies' - if not br.visit('http://www.volkskrant.nl'): - raise Exception('Failed to connect to volksrant website') - br.click('#pop_cookie_text a[onclick]', 
wait_for_load=True, timeout=120) def get_obfuscated_article(self, url): br = self.browser.clone_browser() - print 'THE CURRENT URL IS: ', url br.open(url) year = date.today().year diff --git a/recipes/websecurity_pl.recipe b/recipes/websecurity_pl.recipe new file mode 100644 index 0000000000..85995b01bf --- /dev/null +++ b/recipes/websecurity_pl.recipe @@ -0,0 +1,28 @@ +__license__ = 'GPL v3' +from calibre.web.feeds.news import BasicNewsRecipe + +class WebSecurity(BasicNewsRecipe): + title = u'WebSecurity' + __author__ = 'fenuks' + description = u'WebSecurity.pl to największy w Polsce portal o bezpieczeństwie sieciowym.' + category = '' + #publication_type = '' + language = 'pl' + #encoding = '' + #extra_css = '' + cover_url = 'http://websecurity.pl/images/websecurity-logo.png' + masthead_url = '' + use_embedded_content = False + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} + + keep_only_tags = [dict(attrs={'class':'article single'}), dict(id='content')] + remove_tags = [dict(attrs={'class':['sociable', 'no-comments']})] + remove_tags_after = dict(attrs={'class':'sociable'}) + feeds = [(u'Wszystkie', u'http://websecurity.pl/feed/'), (u'Aktualno\u015bci', u'http://websecurity.pl/aktualnosci/feed/'), (u'Artyku\u0142y', u'http://websecurity.pl/artykuly/feed/'), (u'Blogosfera', u'http://websecurity.pl/blogosfera/wpisy/feed/')] + diff --git a/recipes/what_if.recipe b/recipes/what_if.recipe new file mode 100644 index 0000000000..1d642353d2 --- /dev/null +++ b/recipes/what_if.recipe @@ -0,0 +1,24 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class XkcdCom(BasicNewsRecipe): + cover_url = 'http://what-if.xkcd.com/imgs/whatif-logo.png' + masthead_url = 'http://what-if.xkcd.com/imgs/whatif-logo.png' + __author__ = 'kisnik' + title = 'What If...' 
+ description = 'The "What If" feed from xkcd' + language = 'en' + keep_only_tags = [dict(name='article')] + + use_embedded_content = False + oldest_article = 60 + # add image and text + # add an horizontal line after the question + preprocess_regexps = [ + (re.compile(r'()'), + lambda m: '
%s%s

(%s)

' % (m.group(1), m.group(3), m.group(2))), + (re.compile(r'([^>]+

)'), + lambda n: '%s
' % (n.group(1))), + ] + + extra_css = "#photo_text{font-size:small;}" diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index 2adac1e113..90dde251ca 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): - EDITION = 0 - FIND_LAST_FULL_ISSUE = True - EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + EDITION = 0 + FIND_LAST_FULL_ISSUE = True + EXCLUDE_LOCKED = True + ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + title = u'Wprost' + __author__ = 'matek09' + description = 'Weekly magazine' + encoding = 'ISO-8859-2' + no_stylesheets = True + language = 'pl' + remove_javascript = True + recursions = 0 + remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + ''' + keep_only_tags =[] + keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'})) + ''' - title = u'Wprost' - __author__ = 'matek09' - description = 'Weekly magazine' - encoding = 'ISO-8859-2' - no_stylesheets = True - language = 'pl' - remove_javascript = True - recursions = 0 - - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - - '''keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def 
element-autor'}))''' - - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), + preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), (re.compile(r'display: block;'), lambda match: ''), (re.compile(r'\\\<\/table\>'), lambda match: ''), (re.compile(r'\

'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\