Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit b261ba7ef3: Sync to trunk.
COPYRIGHT (12 changed lines)

@@ -9,6 +9,12 @@ License: GPL-2 or later
 The full text of the GPL is distributed as in
 /usr/share/common-licenses/GPL-2 on Debian systems.
 
+Files: setup/iso_639/*
+Copyright: Various
+License: LGPL 2.1
+The full text of the LGPL is distributed as in
+/usr/share/common-licenses/LGPL-2.1 on Debian systems.
+
 Files: src/calibre/ebooks/BeautifulSoup.py
 Copyright: Copyright (c) 2004-2007, Leonard Richardson
 License: BSD
@@ -28,6 +34,12 @@ License: other
 are permitted in any medium without royalty provided the copyright
 notice and this notice are preserved.
 
+Files: src/calibre/ebooks/readability/*
+Copyright: Unknown
+License: Apache 2.0
+The full text of the Apache 2.0 license is available at:
+http://www.apache.org/licenses/LICENSE-2.0
+
 Files: /src/cherrypy/*
 Copyright: Copyright (c) 2004-2007, CherryPy Team (team@cherrypy.org)
 Copyright: Copyright (C) 2005, Tiago Cogumbreiro <cogumbreiro@users.sf.net>
@@ -28,11 +28,12 @@ class CNN(BasicNewsRecipe):
         (re.compile(r'<style.*?</style>', re.DOTALL), lambda m: ''),
         ]
 
-    keep_only_tags = [dict(id='cnnContentContainer')]
+    keep_only_tags = [dict(id=['cnnContentContainer', 'storycontent'])]
     remove_tags = [
         {'class':['cnn_strybtntools', 'cnn_strylftcntnt',
             'cnn_strybtntools', 'cnn_strybtntoolsbttm', 'cnn_strybtmcntnt',
-            'cnn_strycntntrgt']},
+            'cnn_strycntntrgt', 'hed_side', 'foot']},
+        dict(id=['ie_column']),
         ]
 
 
@@ -1,5 +1,3 @@
-#import re   # Provides preprocess_regexps re.compile
-
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class FairbanksDailyNewsminer(BasicNewsRecipe):
@@ -8,21 +6,28 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
 
-    description = ''''The voice of interior Alaska since 1903'''
+    description = 'The voice of interior Alaska since 1903'
     publisher = 'http://www.newsminer.com/'
     category = 'news, Alaska, Fairbanks'
     language = 'en'
-    #extra_css = '''
-    #    p{font-weight: normal;text-align: justify}
-    #    '''
+
+    # Make article titles, author and date bold, italic or small font.
+    # http://assets.matchbin.com/sites/635/stylesheets/newsminer.com.css
+    # (signature_line contains date, views, comments)
+    extra_css = '''
+        .story_item_headline { font-size: medium; font-weight: bold; }
+        .story_item_author { font-size: small; font-style:italic; }
+        .signature_line { font-size: small; }
+        '''
 
     remove_javascript = True
     use_embedded_content = False
     no_stylesheets = True
     language = 'en'
     encoding = 'utf8'
     conversion_options = {'linearize_tables':True}
-    # TODO: I don't see any photos in my Mobi file with this masterhead_url!
+
+    # TODO: The News-miner cover image seems a bit small. Can this be enlarged by 10-30%?
     masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg'
 
 
@@ -31,6 +36,10 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
     # manual processing is needed to get just the "story_item_date updated"
     # (which contains the date). Everything else on this line is pretty much not needed.
     #
+    # Currently, you will see the following:
+    #     | Aug 24, 2011 | 654 views | 6 | |
+    # (ie. 6 comments)
+    #
     # HTML line containing story_item_date:
     # <div class="signature_line"><span title="2011-08-22T23:37:14Z" class="story_item_date updated">Aug 22, 2011</span> | 2370 views | 52 <a href="/pages/full_story/push?article-Officials+tout+new+South+Cushman+homeless+living+facility%20&id=15183753#comments_15183753"><img alt="52 comments" class="dont_touch_me" src="http://d2uh5w9wm14i0w.cloudfront.net/images/comments-icon.gif" title="52 comments" /></a> | <span id="number_recommendations_15183753" class="number_recommendations">9</span> <a href="#1" id="recommend_link_15183753" onclick="Element.remove('recommend_link_15183753'); new Ajax.Request('/community/content/recommend/15183753', {asynchronous:true, evalScripts:true}); return false;"><img alt="9 recommendations" class="dont_touch_me" src="http://d2uh5w9wm14i0w.cloudfront.net/images/thumbs-up-icon.gif" title="9 recommendations" /></a> | <a href="#1" onclick="$j.facebox({ajax: '/community/content/email_friend_pane/15183753'}); return false;"><span style="position: relative;"><img alt="email to a friend" class="dont_touch_me" src="http://d2uh5w9wm14i0w.cloudfront.net/images/email-this.gif" title="email to a friend" /></span></a> | <span><a href="/printer_friendly/15183753" target="_blank"><img alt="print" class="dont_touch_me" src="http://d2uh5w9wm14i0w.cloudfront.net/images/print_icon.gif" title="print" /></a></span><span id="email_content_message_15183753" class="signature_email_message"></span></div>
 
@@ -40,73 +49,49 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
 
     #preprocess_regexps = [(re.compile(r'<span[^>]*addthis_separator*>'), lambda match: '') ]
     #preprocess_regexps = [(re.compile(r'span class="addthis_separator">|</span>'), lambda match: '') ]
 
     #preprocess_regexps = [
     #    (re.compile(r'<start>.*?<end>', re.IGNORECASE | re.DOTALL), lambda match : ''),
     #    ]
 
     #def get_browser(self):
     #def preprocess_html(soup, first_fetch):
     #    date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'}))
     #    return
 
+    #preprocess_regexps = [(re.compile(r' |.*?', re.DOTALL), lambda m: '')]
 
 
-    # Try to keep some tags - some might not be needed here
     keep_only_tags = [
-        #date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})),
-        dict(name='div', attrs={'class':'hnews hentry item'}),
+        #dict(name='div', attrs={'class':'hnews hentry item'}),
         dict(name='div', attrs={'class':'story_item_headline entry-title'}),
+        #dict(name='div', attrs={'class':'story_item_author'}),
         #dict(name='span', attrs={'class':'story_item_date updated'}),
+        #dict(name='div', attrs={'class':'story_item_author'}),
         dict(name='div', attrs={'class':'full_story'})
         ]
-    #remove_tags = [
-    #    dict(name='div', attrs={'class':'story_tools'}),
-    #    dict(name='p', attrs={'class':'ad_label'}),
-    #    ]
 
-    # Try to remove some bothersome tags
     remove_tags = [
+        # Try getting rid of some signature_line (date line) stuff
        #dict(name='img', attrs={'alt'}),
        dict(name='img', attrs={'class':'dont_touch_me'}),
        dict(name='span', attrs={'class':'number_recommendations'}),
        #dict(name='div', attrs={'class':'signature_line'}),
 
+       # Removes div within <!-- AddThis Button BEGIN --> <!-- AddThis Button END -->
        dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}),
-       dict(name='div', attrs={'class':['addthis_toolbox','addthis_default_style']}),
-       dict(name='span', attrs={'class':'addthis_separator'}),
        dict(name='div', attrs={'class':'related_content'}),
-       dict(name='div', attrs={'class':'comments_container'}),
-       #dict(name='div', attrs={'class':'signature_line'}),
-       dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}),
        dict(name='div', attrs={'id':'comments_container'})
        ]
 
 
-    # This one works but only gets title, date and clips article content!
-    #remove_tags_after = [
-    #    dict(name='span', attrs={'class':'story_item_date updated'})
-    #    ]
 
-    #remove_tags_after = [
-    #    dict(name='div', attrs={'class':'advertisement'}),
-    #    ]
 
-    # Try clipping tags before and after to prevent pulling img views/posts numbers after date?
-    #remove_tags_before = [
-    #    dict(name='span', attrs={'class':'story_item_date updated'})
-    #    ]
 
-    #extra_css  # tweak the appearance  # TODO: Change article titles <h2?> to bold?
 
 
     # Comment-out or uncomment any of the following RSS feeds according to your
     # liking.
     #
-    # TODO: Adding more then one RSS Feed, and newline will be omitted for
-    # entries within the Table of Contents or Index of Articles
-    #
-    # TODO: Some random bits of text is trailing the last page (or TOC on MOBI
-    # files), these are bits of public posts and comments and need to also be
-    # removed.
+    # TODO: Some random bits of text might be trailing the last page (or TOC on
+    # MOBI files), these are bits of public posts and comments and need to also
+    # be removed.
     #
     feeds = [
         (u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'),
@@ -114,15 +99,15 @@ class FairbanksDailyNewsminer(BasicNewsRecipe):
         (u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'),
         (u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'),
         (u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'),
-        # (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'),
+        (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'),
         (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'),
-        # (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'),
-        # (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'),
+        (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'),
+        #(u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'),
         (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'),
-        # (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'),
-        # (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'),
-        # (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'),
-        # (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'),
-        (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin')
+        (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'),
+        (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'),
+        #(u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'),
+        (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'),
+        #(u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin')
         ]
 
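
A minimal sketch (not part of the recipe) of the idea behind the commented-out preprocess_html above: isolating the date span from the signature_line markup quoted in the comments, using the BeautifulSoup module bundled with calibre (src/calibre/ebooks/BeautifulSoup.py, listed in the COPYRIGHT diff). The sample HTML below is a trimmed version of the line quoted in the recipe's comments.

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    # Trimmed-down version of the signature_line div quoted in the comments
    html = ('<div class="signature_line">'
            '<span title="2011-08-22T23:37:14Z" class="story_item_date updated">Aug 22, 2011</span>'
            ' | 2370 views | 52 comments</div>')
    soup = BeautifulSoup(html)
    date_span = soup.find('span', attrs={'class': 'story_item_date updated'})
    print date_span.string   # Aug 22, 2011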
recipes/hackernews.recipe (new file, 86 lines)

@@ -0,0 +1,86 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse

class HackerNews(BasicNewsRecipe):
    title = 'Hacker News'
    __author__ = 'Tom Scholl'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    masthead_url = 'http://i55.tinypic.com/2u6io76.png'
    cover_url = 'http://i55.tinypic.com/2u6io76.png'
    delay = 1
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0,8,16)

    feeds = [
        (u'Hacker News', 'http://news.ycombinator.com/rss')
    ]

    temp_files = []
    articles_are_obfuscated = True

    def get_readable_content(self, url):
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()

        return self.extract_readable_article(html, url)

    def get_hn_content(self, url):
        self.log('get_hn_content(' + url + ')')
        # this could be improved
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()
        return html

    def get_obfuscated_article(self, url):
        if url.startswith('http://news.ycombinator.com'):
            content = self.get_hn_content(url)
        else:
            # TODO: use content-type header instead of url
            is_image = False
            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp',]:
                if url.endswith(ext):
                    is_image = True
                    break

            if is_image:
                self.log('using image_content (' + url + ')')
                content = u'<html><body><img src="' + url + u'"></body></html>'
            else:
                content = self.get_readable_content(url)

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(content)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    def is_link_wanted(self, url, tag):
        if url.endswith('.pdf'):
            return False
        return True

    def prettyify_url(self, url):
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary
recipes/samanyolu_haber.recipe (new file, 63 lines)

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-

from calibre.web.feeds.news import BasicNewsRecipe

class SHaber (BasicNewsRecipe):

    title = u'Samanyolu Haber'
    __author__ = u'thomass'
    description = ' Samanyolu Haber Sitesinden günlük haberler '
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    #use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'thomass'
    category = 'güncel, haber, türkçe'
    language = 'tr'
    publication_type = 'newspaper'

    conversion_options = {
        'tags'             : category
        ,'language'        : language
        ,'publisher'       : publisher
        ,'linearize_tables': True
    }
    extra_css = ' .Haber-Baslik-Yazisi {font-weight: bold; font-size: 9px} .Haber-Ozet-Yazisi{ font-family:sans-serif;font-weight: normal;font-size: 11px } #Haber{ font-family:sans-serif;font-weight: normal;font-size: 9px }.KirmiziText{ font-weight: normal;font-size: 5px }' #.story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '

    #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    keep_only_tags = [dict(name='div', attrs={'class':['Haber-Baslik-Yazisi','Haber-Ozet-Yazisi']}),dict(name='div', attrs={'id':['ctl00_ContentPlaceHolder1_imagenew','Haber']})]#,dict(name='h6', attrs={'class':['KirmiziText',]}) dict(name='div', attrs={'id':['Haber']}),dict(name='div', attrs={'id':['gallery']})]
    #remove_tags = [dict(name='img', attrs={'src':[ 'http://medya.aksiyon.com.tr/aksiyon/images/logo/logo.bmp','/aksiyon/images/template/green/baslik0.gif','mobile/home.jpg']}) ],dict(name='h1', attrs={'class':['H1-Haber-DetayBasligi']}),dict(name='h4', attrs={'class':['BrownText']}) ,

    cover_img_url = 'http://static.samanyoluhaber.com/Images/resources/images/samanyoluhaber-yazi-logo.png'
    masthead_url = 'http://static.samanyoluhaber.com/Images/resources/images/samanyoluhaber-yazi-logo.png'
    remove_empty_feeds= True
    #remove_attributes = ['width','height']

    feeds = [
        ( u'Son Dakika', u'http://podcast.samanyoluhaber.com/sondakika.rss'),
        ( u'Gündem', u'http://podcast.samanyoluhaber.com/gundem.rss'),
        ( u'Politika ', u'http://podcast.samanyoluhaber.com/politika.rss'),
        ( u'Ekonomi', u'http://podcast.samanyoluhaber.com/ekonomi.rss'),
        ( u'Dünya', u'http://podcast.samanyoluhaber.com/dunya.rss'),
        ( u'Spor ', u'http://podcast.samanyoluhaber.com/spor.rss'),
        ( u'Sağlık', u'http://podcast.samanyoluhaber.com/saglik.rss'),
        ( u'Kültür', u'http://podcast.samanyoluhaber.com/kultur.rss'),
        #( u'Teknoloji ', u'http://podcast.samanyoluhaber.com/teknoloji.rss'),
        ( u'Eğitim', u'http://podcast.samanyoluhaber.com/egitim.rss'),
        ( u'Ramazan', u'http://podcast.samanyoluhaber.com/ramazan.rss'),
        ( u'Yazarlar ', u'http://podcast.samanyoluhaber.com/yazarlar.rss'),

    ]

    def preprocess_html(self, soup):
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

    # def print_version(self, url):
    #     return url.replace('http://www.aksiyon.com.tr/aksiyon/newsDetail_getNewsById.action?load=detay&', 'http://www.aksiyon.com.tr/aksiyon/mobile_detailn.action?')
recipes/samanyolu_teknoloji.recipe (new file, 55 lines)

@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-

from calibre.web.feeds.news import BasicNewsRecipe

class SHaberTekno (BasicNewsRecipe):

    title = u'Samanyolu Teknoloji'
    __author__ = u'thomass'
    description = 'Samanyolu Teknoloji Haber Sitesinden haberler '
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    #use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'thomass'
    category = 'bilim, teknoloji, haber, türkçe'
    language = 'tr'
    publication_type = 'magazine'

    conversion_options = {
        'tags'             : category
        ,'language'        : language
        ,'publisher'       : publisher
        ,'linearize_tables': True
    }
    extra_css = ' .IcerikMetin{ font-family:sans-serif;font-weight: normal;font-size: 10px } .h1IcerikBaslik {font-weight: bold; font-size: 18px}' #.story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '

    #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    keep_only_tags = [dict(name='div', attrs={'class':['IcerikBaslik','IcerikMetinDiv']})]#,dict(name='span', attrs={'class':['agenda2Title']}),dict(name='div', attrs={'id':['gallery']})]
    #remove_tags = [dict(name='img', attrs={'src':[ 'http://medya.aksiyon.com.tr/aksiyon/images/logo/logo.bmp','/aksiyon/images/template/green/baslik0.gif','mobile/home.jpg']}) ]

    cover_img_url = 'http://teknoloji.samanyoluhaber.com/resources/images/logo_s_digi.jpg'
    masthead_url = 'http://teknoloji.samanyoluhaber.com/resources/images/logo_s_digi.jpg'
    remove_empty_feeds= True
    #remove_attributes = ['width','height']

    feeds = [
        ( u'GENEL', u'http://podcast.samanyoluhaber.com/Teknoloji.rss'),
        ( u'İNTERNET', u'http://open.dapper.net/services/shaberteknolojiinternet'),
        ( u'CEP TELEFONU', u'http://open.dapper.net/services/shaberteknolojicep'),
        ( u'OYUN', u'http://open.dapper.net/services/shaberteknolojioyun'),
        ( u'DONANIM', u'http://open.dapper.net/services/httpopendappernetservicesshaberteknolojidonanim'),
        ( u'ÜRÜN İNCELEME', u'http://open.dapper.net/services/shaberteknolojiurun'),
        ( u'ALIŞVERİŞ', u'http://open.dapper.net/services/shaberteknolojialisveris'),
        ( u'BİLİM & TEKNOLOJİ', u'http://open.dapper.net/services/shaberteknolojibilim'),
        ( u'HABERLER', u'http://open.dapper.net/services/shaberteknolojihaber'),

    ]

    # def print_version(self, url):
    #     return url.replace('http://www.aksiyon.com.tr/aksiyon/newsDetail_getNewsById.action?load=detay&', 'http://www.aksiyon.com.tr/aksiyon/mobile_detailn.action?')
recipes/star_gazetesi.recipe (new file, 67 lines)

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-

from calibre.web.feeds.news import BasicNewsRecipe

class Star (BasicNewsRecipe):

    title = u'Star Gazetesi'
    __author__ = u'thomass'
    description = 'yeni Türkiye''nin Gazetesi'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    #use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'thomass'
    category = 'güncel, haber, türkçe'
    language = 'tr'
    publication_type = 'newspaper'

    conversion_options = {
        'tags'             : category
        ,'language'        : language
        ,'publisher'       : publisher
        ,'linearize_tables': True
    }
    extra_css = ' .font8{font-weight: bold; font-size:20px}.font11{font-weight: normal; font-size:small}#hdetay{ font-family:sans-serif;font-size: 9px }' #.story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '

    #extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    keep_only_tags = [dict(name='div', attrs={'class':['font8']}),dict(name='span', attrs={'class':['font11']}),dict(name='div', attrs={'id':['hdetay']})]#,,dict(name='h6', attrs={'class':['KirmiziText']}) dict(name='div', attrs={'id':['Haber']}),dict(name='div', attrs={'id':['gallery']})]
    #remove_tags = [dict(name='img', attrs={'src':[ 'http://medya.aksiyon.com.tr/aksiyon/images/logo/logo.bmp','/aksiyon/images/template/green/baslik0.gif','mobile/home.jpg']}) ],dict(name='h1', attrs={'class':['H1-Haber-DetayBasligi']}),dict(name='h4', attrs={'class':['BrownText']}) ,

    cover_img_url = 'http://www.stargazete.com/starnew/img/starlogo.png'
    masthead_url = 'http://www.stargazete.com/starnew/img/starlogo.png'
    remove_empty_feeds= True
    #remove_attributes = ['width','height']

    feeds = [

        ( u'MANSET', u'http://open.dapper.net/services/starmanset'),
        ( u'GÜNCEL', u'http://www.stargazete.com/guncel.xml'),
        ( u'POLİTİKA', u'http://www.stargazete.com/politika.xml'),
        ( u' EKONOMİ', u'http://www.stargazete.com/ekonomi.xml'),
        ( u'DÜNYA', u'http://www.stargazete.com/dunya.xml'),
        ( u'YAZARLAR', u'http://www.stargazete.com/gazeteyazarlar.xml'),
        ( u'SPOR', u'http://www.stargazete.com/spor.xml'),
        ( u'SPOR YAZARLARI', u'http://www.stargazete.com/index.php?metot=rss&islem=sporyazarlar'),
        ( u'SİNEMA', u'http://www.stargazete.com/sinema.xml'),
        ( u'KADIN&SAĞLIK', u'http://www.stargazete.com/kadinsaglik.xml'),
        ( u' STARTEK', u'http://www.stargazete.com/startek.xml'),
        ( u' AÇIK GÖRÜŞ', u'http://www.stargazete.com/acikgorus.xml'),
        ( u'Star PAZAR', u'http://www.stargazete.com/pazar.xml'),
        ( u'Star CUMARTESİ', u'http://www.stargazete.com/cumartesi.xml'),

    ]

    def preprocess_html(self, soup): #remove links
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

    #def print_version(self, url):
    #    return url.replace('/', 'http://www.stargazete.com/')
setup/iso639.xml (2169 lines) — diff suppressed because it is too large
setup/iso_639/README (new file, 3 lines)

@@ -0,0 +1,3 @@
These files are taken from the iso-codes package, licensed under the LGPL 2.1

All the files are from the iso_639_3 sub-directory.
New files under setup/iso_639/ (each diff suppressed because it is too large; line counts in parentheses):

af.po (34266), am.po (33173), ar.po (33365), az.po (33394), bg.po (30849),
bn_IN.po (33998), br.po (30899), bs.po (33078), byn.po (33173), ca.po (33938),
crh.po (30926), cs.po (33542), cy.po (34256), da.po (34952), de.po (35241),
el.po (32435), eo.po (34969), es.po (34978), et.po (33392), eu.po (33195),
fa.po (34699), fi.po (35015), fr.po (30860), ga.po (35031), gez.po (33173),
gl.po (30865), gu.po (30893), he.po (33228), hi.po (33178), hr.po (33298),
hu.po (35252), id.po (34070), is.po (33229), iso_639_3.xml (39178),
it.po (30851), ja.po (43196), kn.po (30927), ko.po (33380), kok.po (33153),
lt.po (30897), lv.po (33227), mi.po (33247), mk.po (33236), mn.po (34585),
mr.po (30867), ms.po (33374), mt.po (34967), nb.po (34114), nl.po (34201),
nn.po (33810), nso.po (33062), oc.po (30863), or.po (30863), pa.po (30932),
pl.po (30850), ps.po (31615), pt.po (34936), pt_BR.po (32497), ro.po (33391),
ru.po (34947), rw.po (34991), sk.po (33228), sl.po (34187), sr.po (35064),
sr@latin.po (35064), sv.po (35702), ta.po (30933), th.po (30938), ti.po (33173),
tig.po (33173), tr.po (34572), tt.po (33506), uk.po (30851), ve.po (34473),
vi.po (34958), wa.po (34192), xh.po (33142), zh_CN.po (33214), zh_TW.po (33204),
zu.po (33228)
@@ -7,7 +7,6 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import os, tempfile, shutil, subprocess, glob, re, time, textwrap
-from distutils import sysconfig
 from functools import partial
 
 from setup import Command, __appname__, __version__
@@ -142,19 +141,18 @@ class Translations(POT): # {{{
                 os.makedirs(base)
             self.info('\tCompiling translations for', locale)
             subprocess.check_call(['msgfmt', '-o', dest, f])
-            if locale in ('en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds', 'te', 'yi'):
-                continue
-            pycountry = self.j(sysconfig.get_python_lib(), 'pycountry',
-                    'locales', locale, 'LC_MESSAGES')
-            if os.path.exists(pycountry):
-                iso639 = self.j(pycountry, 'iso639.mo')
-                dest = self.j(self.d(dest), self.b(iso639))
-                if self.newer(dest, iso639) and os.path.exists(iso639):
+            iscpo = {'bn':'bn_IN', 'zh_HK':'zh_CN'}.get(locale, locale)
+            iso639 = self.j(self.d(self.SRC), 'setup', 'iso_639',
+                    '%s.po'%iscpo)
+
+            if os.path.exists(iso639):
+                dest = self.j(self.d(dest), 'iso639.mo')
+                if self.newer(dest, iso639):
                     self.info('\tCopying ISO 639 translations')
-                    shutil.copy2(iso639, dest)
-            else:
-                self.warn('No ISO 639 translations for locale:', locale,
-                    '\nDo you have pycountry installed?')
+                    subprocess.check_call(['msgfmt', '-o', dest, iso639])
+            elif locale not in ('en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc',
+                    'ltg', 'nds', 'te', 'yi', 'fo', 'sq', 'ast', 'ml'):
+                self.warn('No ISO 639 translations for locale:', locale)
 
         self.write_stats()
         self.freeze_locales()
@@ -212,7 +210,7 @@ class Translations(POT): # {{{
 
 # }}}
 
-class GetTranslations(Translations):
+class GetTranslations(Translations): # {{{
 
     description = 'Get updated translations from Launchpad'
     BRANCH = 'lp:~kovid/calibre/translations'
@@ -273,24 +271,25 @@ class GetTranslations(Translations):
         subprocess.check_call(['bzr', 'commit', '-m',
             'IGN:Translation corrections', cls.PATH])
 
-class ISO639(Command):
+# }}}
+
+class ISO639(Command): # {{{
 
     description = 'Compile translations for ISO 639 codes'
     DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization',
             'iso639.pickle')
 
     def run(self, opts):
-        src = self.j(self.d(self.SRC), 'setup', 'iso639.xml')
+        src = self.j(self.d(self.SRC), 'setup', 'iso_639')
         if not os.path.exists(src):
            raise Exception(src + ' does not exist')
         dest = self.DEST
-        if not self.newer(dest, src):
+        if not self.newer(dest, [src, __file__]):
            self.info('Pickled code is up to date')
            return
         self.info('Pickling ISO-639 codes to', dest)
         from lxml import etree
-        root = etree.fromstring(open(src, 'rb').read())
+        root = etree.fromstring(open(self.j(src, 'iso_639_3.xml'), 'rb').read())
         by_2 = {}
         by_3b = {}
         by_3t = {}
@@ -298,12 +297,16 @@ class ISO639(Command):
         m3to2 = {}
         m3bto3t = {}
         nm = {}
-        codes2, codes3t, codes3b = set([]), set([]), set([])
-        for x in root.xpath('//iso_639_entry'):
+        codes2, codes3t, codes3b = set(), set(), set()
+        for x in root.xpath('//iso_639_3_entry'):
+            two = x.get('part1_code', None)
+            threet = x.get('id')
+            threeb = x.get('part2_code', None)
+            if threeb is None:
+                # Only recognize langauges in ISO-639-2
+                continue
             name = x.get('name')
-            two = x.get('iso_639_1_code', None)
-            threeb = x.get('iso_639_2B_code')
-            threet = x.get('iso_639_2T_code')
             if two is not None:
                 by_2[two] = name
                 codes2.add(two)
@@ -313,13 +316,10 @@ class ISO639(Command):
                 by_3t[threet] = name
                 if threeb != threet:
                     m3bto3t[threeb] = threet
-            codes3b.add(x.get('iso_639_2B_code'))
-            codes3t.add(x.get('iso_639_2T_code'))
+            codes3b.add(threeb)
+            codes3t.add(threet)
             base_name = name.lower()
             nm[base_name] = threet
-            simple_name = base_name.partition(';')[0].strip()
-            if simple_name not in nm:
-                nm[simple_name] = threet
 
         from cPickle import dump
         x = {'by_2':by_2, 'by_3b':by_3b, 'by_3t':by_3t, 'codes2':codes2,
@@ -331,4 +331,5 @@ class ISO639(Command):
         if os.path.exists(self.DEST):
             os.remove(self.DEST)
 
+# }}}
 
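
For reference, a minimal sketch (not part of this commit) of what reading the pickle written by ISO639.run() could look like. Only the dictionary keys visible in the diff above (by_2, codes2) are used, and the relative path assumes the file landed at the DEST defined above inside a source checkout.

    from cPickle import load

    with open('resources/localization/iso639.pickle', 'rb') as f:
        iso639 = load(f)

    print iso639['by_2'].get('fr')   # full language name for a 2-letter code
    print 'en' in iso639['codes2']   # True if the 2-letter code is known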
@@ -19,11 +19,12 @@ class ANDROID(USBMS):
 
     VENDOR_ID = {
             # HTC
             0x0bb4 : { 0xc02 : [0x100, 0x0227, 0x0226, 0x222],
                        0xc01 : [0x100, 0x0227, 0x0226],
                        0xff9 : [0x0100, 0x0227, 0x0226],
+                       0xc86 : [0x100, 0x0227, 0x0226, 0x222],
                        0xc87 : [0x0100, 0x0227, 0x0226],
                        0xc91 : [0x0100, 0x0227, 0x0226],
                        0xc92 : [0x100, 0x0227, 0x0226, 0x222],
                        0xc97 : [0x100, 0x0227, 0x0226, 0x222],
                        0xc99 : [0x100, 0x0227, 0x0226, 0x222],
@@ -124,7 +125,7 @@ class ANDROID(USBMS):
     VENDOR_NAME = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
             'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS',
             'TELECHIP', 'HUAWEI', 'T-MOBILE', 'SEMC', 'LGE', 'NVIDIA',
-            'GENERIC-', 'ZTE', 'MID']
+            'GENERIC-', 'ZTE', 'MID', 'QUALCOMM']
     WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
             '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
             'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
@@ -85,7 +85,8 @@ class Serializer(object):
             spine_item(item).is_section_start = True
             for i, article in enumerate(articles):
                 si = spine_item(article)
-                si.is_article_start = True
+                if si is not None:
+                    si.is_article_start = True
 
         items = list(self.oeb.spine)
         in_sec = in_art = False
@@ -116,7 +117,7 @@ class Serializer(object):
         buf.write(b'</html>')
         self.end_offset = buf.tell()
         self.fixup_links()
-        if self.start_offset is None:
+        if self.start_offset is None and not self.is_periodical:
             # If we don't set a start offset, the stupid Kindle will
             # open the book at the location of the first IndexEntry, which
             # could be anywhere. So ensure the book is always opened at the
@@ -885,7 +885,8 @@ class Manifest(object):
             try:
                 data = etree.fromstring(data, parser=parser)
             except etree.XMLSyntaxError as err:
-                self.oeb.log.exception('Initial parse failed:')
+                self.oeb.log.debug('Initial parse failed, using more'
+                        ' forgiving parsers')
                 repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
                 data = ENTITY_RE.sub(repl, data)
                 try:
src/calibre/ebooks/readability/README.txt (new file, 37 lines)

@@ -0,0 +1,37 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0

This is a python port of a ruby port of arc90's readability project, taken
from https://github.com/buriy/python-readability

The original readability project:
http://lab.arc90.com/experiments/readability/

In few words,
Given a html document, it pulls out the main body text and cleans it up.
It also can clean up title based on latest readability.js code.

Based on:
    - Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
    - Ruby port by starrhorne and iterationlabs
    - Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
    - Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
    - "BR to P" fix from readability.js which improves quality for smaller texts.
    - Github users contributions.

Installation::

    easy_install readability-lxml
    or
    pip install readability-lxml

Usage::

    from readability.readability import Document
    import urllib
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()

Command-line usage::

    python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
src/calibre/ebooks/readability/__init__.py (new file, 1 blank line)
32  src/calibre/ebooks/readability/cleaners.py  Normal file
@@ -0,0 +1,32 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
    "([^>]+) " # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
    "([^>]*)" # postfix
    ">" # end
, re.I)

def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html

def normalize_spaces(s):
    if not s: return ''
    """replace any sequence of whitespace
    characters with a single space"""
    return ' '.join(s.split())

html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                  style=True, links=True, meta=False, add_nofollow=False,
                  page_structure=False, processing_instructions=True, embedded=False,
                  frames=False, forms=False, annoying_tags=False, remove_tags=None,
                  remove_unknown_tags=False, safe_attrs_only=False)
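cleaners.py is mostly regex plumbing: clean_attributes() repeatedly strips presentational attributes (width, height, style, colours and the like) from tags, and html_cleaner is an lxml Cleaner configured to drop scripts, styles and comments while leaving page structure alone. A small usage sketch, assuming the calibre package is importable::

    from calibre.ebooks.readability.cleaners import clean_attributes, normalize_spaces

    html = '<td style="color:red" width="300">Hello   world</td>'
    print clean_attributes(html)          # -> '<td>Hello   world</td>' (style and width removed)
    print normalize_spaces('Hello   world\n')   # -> 'Hello world'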
25  src/calibre/ebooks/readability/debug.py  Normal file
@@ -0,0 +1,25 @@
def save_to_file(text, filename):
    f = open(filename, 'wt')
    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    f.write(text.encode('utf-8'))
    f.close()

uids = {}
def describe(node, depth=2):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''): name += '#'+node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ','.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        if not node in uids:
            uid = uids[node] = len(uids)+1
        else:
            uid = uids.get(node)
        name += "%02d" % (uid)
    if depth and node.getparent() is not None:
        return name+' - '+describe(node.getparent(), depth-1)
    return name
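describe() builds a short CSS-selector-like signature for an element (tag plus #id and .classes, with part of the parent chain appended), which the scoring code below uses in its debug output. A tiny sketch of the kind of output to expect, assuming the calibre package and lxml are importable::

    import lxml.html
    from calibre.ebooks.readability.debug import describe

    doc = lxml.html.document_fromstring(
        '<html><body><div id="story" class="main"><p>text</p></div></body></html>')
    div = doc.get_element_by_id('story')
    print describe(div)   # -> '#story.main - body - html'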
103  src/calibre/ebooks/readability/htmls.py  Normal file
@@ -0,0 +1,103 @@
import re

from lxml.html import tostring
import lxml.html

from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode

def build_doc(page):
    page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
    doc = lxml.html.document_fromstring(page_unicode)
    return doc

def js_re(src, pattern, flags, repl):
    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))


def normalize_entities(cur_title):
    entities = {
        u'\u2014':'-',
        u'\u2013':'-',
        u'&mdash;': '-',
        u'&ndash;': '-',
        u'\u00A0': ' ',
        u'\u00AB': '"',
        u'\u00BB': '"',
        u'&quot;': '"',
    }
    for c, r in entities.iteritems():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title

def norm_title(title):
    return normalize_entities(normalize_spaces(title))

def get_title(doc):
    title = doc.find('.//title').text
    if not title:
        return '[no-title]'

    return norm_title(title)

def add_match(collection, text, orig):
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', '') in orig.replace('"', ''):
            collection.add(text)

def shorten_title(doc):
    title = doc.find('.//title').text
    if not title:
        return ''

    title = orig = norm_title(title)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title

def get_body(doc):
    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
    raw_html = unicode(tostring(doc.body or doc))
    return clean_attributes(raw_html)
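shorten_title() collects candidate headings (h1-h3 plus a list of common title ids/classes), keeps the longest candidate that also appears inside the <title> text, and otherwise falls back to splitting the <title> on common site-name delimiters. A short sketch of the intended behaviour, assuming the calibre package and lxml are importable::

    from calibre.ebooks.readability.htmls import build_doc, get_title, shorten_title

    page = ('<html><head><title>Readability ported to calibre | Example News</title></head>'
            '<body><h1>Readability ported to calibre</h1><p>Body text.</p></body></html>')
    doc = build_doc(page)
    print get_title(doc)       # 'Readability ported to calibre | Example News'
    print shorten_title(doc)   # 'Readability ported to calibre'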
513  src/calibre/ebooks/readability/readability.py  Normal file
@@ -0,0 +1,513 @@
import re, sys
from collections import defaultdict

from lxml.etree import tostring
from lxml.html import (fragment_fromstring, document_fromstring,
        tostring as htostring)

from calibre.ebooks.readability.htmls import build_doc, get_body, get_title, shorten_title
from calibre.ebooks.readability.cleaners import html_cleaner, clean_attributes

def tounicode(tree_or_node, **kwargs):
    kwargs['encoding'] = unicode
    return htostring(tree_or_node, **kwargs)


REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile('^\s+|\s+$/'),
    #'normalizeRe': re.compile('\s{2,}/'),
    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}

def describe(node, depth=1):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''): name += '#'+node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ','.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if depth and node.getparent() is not None:
        return name+' - '+describe(node.getparent(), depth-1)
    return name

def to_int(x):
    if not x: return None
    x = x.strip()
    if x.endswith('px'):
        return int(x[:-2])
    if x.endswith('em'):
        return int(x[:-2]) * 12
    return int(x)

def clean(text):
    text = re.sub('\s*\n\s*', '\n', text)
    text = re.sub('[ \t]{2,}', ' ', text)
    return text.strip()

def text_length(i):
    return len(clean(i.text_content() or ""))

class Unparseable(ValueError):
    pass

class Document:
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, input, log, **options):
        self.input = input
        self.options = defaultdict(lambda: None)
        for k, v in options.items():
            self.options[k] = v
        self.html = None
        self.log = log

    def _html(self, force=False):
        if force or self.html is None:
            self.html = self._parse(self.input)
        return self.html

    def _parse(self, input):
        doc = build_doc(input)
        doc = html_cleaner.clean_html(doc)
        base_href = self.options['url']
        if base_href:
            doc.make_links_absolute(base_href, resolve_base_href=True)
        else:
            doc.resolve_base_href()
        return doc

    def content(self):
        return get_body(self._html(True))

    def title(self):
        return get_title(self._html(True))

    def short_title(self):
        return shorten_title(self._html(True))

    def summary(self):
        try:
            ruthless = True
            while True:
                self._html(True)

                for i in self.tags(self.html, 'script', 'style'):
                    i.drop_tree()
                for i in self.tags(self.html, 'body'):
                    i.set('id', 'readabilityBody')
                if ruthless:
                    self.remove_unlikely_candidates()
                self.transform_misused_divs_into_paragraphs()
                candidates = self.score_paragraphs()

                best_candidate = self.select_best_candidate(candidates)
                if best_candidate:
                    article = self.get_article(candidates, best_candidate)
                else:
                    if ruthless:
                        self.log.debug("ruthless removal did not work. ")
                        ruthless = False
                        self.debug("ended up stripping too much - going for a safer _parse")
                        # try again
                        continue
                    else:
                        self.log.debug("Ruthless and lenient parsing did not work. Returning raw html")
                        article = self.html.find('body')
                        if article is None:
                            article = self.html

                cleaned_article = self.sanitize(article, candidates)
                of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
                if ruthless and not of_acceptable_length:
                    ruthless = False
                    continue # try again
                else:
                    return cleaned_article
        except StandardError, e:
            self.log.exception('error getting summary: ' )
            raise Unparseable(str(e)), None, sys.exc_info()[2]

    def get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.

        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
        output = document_fromstring('<div/>')
        parent = output.xpath('//div')[0]
        best_elem = best_candidate['elem']
        for sibling in best_elem.getparent().getchildren():
            #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
            append = False
            if sibling is best_elem:
                append = True
            sibling_key = sibling #HashableElement(sibling)
            if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
                    append = True

            if append:
                parent.append(sibling)
        #if output is not None:
        #    output.append(best_elem)
        return output.find('body')

    def select_best_candidate(self, candidates):
        sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))

        if len(sorted_candidates) == 0:
            return None

        best_candidate = sorted_candidates[0]
        return best_candidate


    def get_link_density(self, elem):
        link_length = 0
        for i in elem.findall(".//a"):
            link_length += text_length(i)
        #if len(elem.findall(".//div") or elem.findall(".//p")):
        #    link_length = link_length
        total_length = text_length(elem)
        return float(link_length) / max(total_length, 1)

    def score_paragraphs(self, ):
        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
        candidates = {}
        #self.debug(str([describe(node) for node in self.tags(self.html, "div")]))

        ordered = []
        for elem in self.tags(self.html, "p", "pre", "td"):
            parent_node = elem.getparent()
            if parent_node is None:
                continue
            grand_parent_node = parent_node.getparent()

            inner_text = clean(elem.text_content() or "")
            inner_text_len = len(inner_text)

            # If this paragraph is less than 25 characters, don't even count it.
            if inner_text_len < MIN_LEN:
                continue

            if parent_node not in candidates:
                candidates[parent_node] = self.score_node(parent_node)
                ordered.append(parent_node)

            if grand_parent_node is not None and grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(grand_parent_node)
                ordered.append(grand_parent_node)

            content_score = 1
            content_score += len(inner_text.split(','))
            content_score += min((inner_text_len / 100), 3)
            #if elem not in candidates:
            #    candidates[elem] = self.score_node(elem)

            #WTF? candidates[elem]['content_score'] += content_score
            candidates[parent_node]['content_score'] += content_score
            if grand_parent_node is not None:
                candidates[grand_parent_node]['content_score'] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content should have a
        # relatively small link density (5% or less) and be mostly unaffected by this operation.
        for elem in ordered:
            candidate = candidates[elem]
            ld = self.get_link_density(elem)
            score = candidate['content_score']
            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
            candidate['content_score'] *= (1 - ld)

        return candidates

    def class_weight(self, e):
        weight = 0
        if e.get('class', None):
            if REGEXES['negativeRe'].search(e.get('class')):
                weight -= 25

            if REGEXES['positiveRe'].search(e.get('class')):
                weight += 25

        if e.get('id', None):
            if REGEXES['negativeRe'].search(e.get('id')):
                weight -= 25

            if REGEXES['positiveRe'].search(e.get('id')):
                weight += 25

        return weight

    def score_node(self, elem):
        content_score = self.class_weight(elem)
        name = elem.tag.lower()
        if name == "div":
            content_score += 5
        elif name in ["pre", "td", "blockquote"]:
            content_score += 3
        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
            content_score -= 3
        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
            content_score -= 5
        return {
            'content_score': content_score,
            'elem': elem
        }

    def debug(self, *a):
        #if self.options['debug']:
        self.log.debug(*a)

    def remove_unlikely_candidates(self):
        for elem in self.html.iter():
            s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
            #self.debug(s)
            if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
                self.debug("Removing unlikely candidate - %s" % describe(elem))
                elem.drop_tree()

    def transform_misused_divs_into_paragraphs(self):
        for elem in self.tags(self.html, 'div'):
            # transform <div>s that do not contain other block elements into <p>s
            if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
                #self.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                #print "Fixed element "+describe(elem)

        for elem in self.tags(self.html, 'div'):
            if elem.text and elem.text.strip():
                p = fragment_fromstring('<p/>')
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)
                #print "Appended "+tounicode(p)+" to "+describe(elem)

            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
                    p = fragment_fromstring('<p/>')
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)
                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
                if child.tag == 'br':
                    #print 'Dropped <br> at '+describe(elem)
                    child.drop_tree()

    def tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in node.findall('.//%s' % tag_name):
                yield e

    def reverse_tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in reversed(node.findall('.//%s' % tag_name)):
                yield e

    def sanitize(self, node, candidates):
        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
                header.drop_tree()

        for elem in self.tags(node, "form", "iframe", "textarea"):
            elem.drop_tree()
        allowed = {}
        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self.reverse_tags(node, "table", "ul", "div"):
            if el in allowed:
                continue
            weight = self.class_weight(el)
            if el in candidates:
                content_score = candidates[el]['content_score']
                #print '!',el, '-> %6.3f' % content_score
            else:
                content_score = 0
            tag = el.tag

            if weight + content_score < 0:
                self.debug("Cleaned %s with score %6.3f and weight %-3s" %
                    (describe(el), content_score, weight, ))
                el.drop_tree()
            elif el.text_content().count(",") < 10:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.findall('.//%s' %kind))
                counts["li"] -= 100

                content_length = text_length(el) # Count the text length excluding any surrounding whitespace
                link_density = self.get_link_density(el)
                parent_node = el.getparent()
                if parent_node is not None:
                    if parent_node in candidates:
                        content_score = candidates[parent_node]['content_score']
                    else:
                        content_score = 0
                #if parent_node is not None:
                    #pweight = self.class_weight(parent_node) + content_score
                    #pname = describe(parent_node)
                #else:
                    #pweight = 0
                    #pname = "no parent"
                to_remove = False
                reason = ""

                #if el.tag == 'div' and counts["img"] >= 1:
                #    continue
                if counts["p"] and counts["img"] > counts["p"]:
                    reason = "too many images (%s)" % counts["img"]
                    to_remove = True
                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
                    reason = "less than 3x <p>s than <input>s"
                    to_remove = True
                elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
                    reason = "too short content length %s without a single image" % content_length
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links %.3f for its weight %s" % (link_density, weight)
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links %.3f for its weight %s" % (link_density, weight)
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                    reason = "<embed>s with too short content length, or too many <embed>s"
                    to_remove = True
                # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
                #     imgs = el.findall('.//img')
                #     valid_img = False
                #     self.debug(tounicode(el))
                #     for img in imgs:
                #
                #         height = img.get('height')
                #         text_length = img.get('text_length')
                #         self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
                #         if to_int(height) >= 100 or to_int(text_length) >= 100:
                #             valid_img = True
                #             self.debug("valid image" + tounicode(img))
                #             break
                #     if valid_img:
                #         to_remove = False
                #         self.debug("Allowing %s" %el.text_content())
                #         for desnode in self.tags(el, "table", "ul", "div"):
                #             allowed[desnode] = True

                #find x non empty preceding and succeeding siblings
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    #self.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    #self.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                #self.debug(str(siblings))
                if siblings and sum(siblings) > 1000 :
                    to_remove = False
                    self.debug("Allowing %s" % describe(el))
                    for desnode in self.tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

                if to_remove:
                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                        (content_score, describe(el), weight, reason))
                    #print tounicode(el)
                    #self.debug("pname %s pweight %.3f" %(pname, pweight))
                    el.drop_tree()

        for el in ([node] + [n for n in node.iter()]):
            if not (self.options['attributes']):
                #el.attrib = {} #FIXME:Checkout the effects of disabling this
                pass

        return clean_attributes(tounicode(node))


class HashableElement():
    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        if self._path is None:
            reverse_path = []
            node = self.node
            while node is not None:
                node_id = (node.tag, tuple(node.attrib.items()), node.text)
                reverse_path.append(node_id)
                node = node.getparent()
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, tag):
        return getattr(self.node, tag)

def main():
    import logging
    from optparse import OptionParser
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-v', '--verbose', action='store_true')
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    (options, args) = parser.parse_args()

    if not (len(args) == 1 or options.url):
        parser.print_help()
        sys.exit(1)
    logging.basicConfig(level=logging.INFO)

    file = None
    if options.url:
        import urllib
        file = urllib.urlopen(options.url)
    else:
        file = open(args[0], 'rt')
    enc = sys.__stdout__.encoding or 'utf-8'
    try:
        print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
    finally:
        file.close()

if __name__ == '__main__':
    main()
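Unlike the standalone readability-lxml package shown in the README, this copy of Document takes a log object as its second argument and uses it for all debug output. A minimal sketch of how the port would be driven from inside calibre, assuming calibre's default_log is importable as shown::

    from calibre.utils.logging import default_log
    from calibre.ebooks.readability.readability import Document

    raw_html = open('article.html', 'rb').read()
    doc = Document(raw_html, default_log, url='http://example.com/article')
    print doc.short_title()
    print doc.summary()   # cleaned-up main-body HTML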
@@ -335,6 +335,7 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):
        geom = gprefs.get('bulk_metadata_window_geometry', None)
        if geom is not None:
            self.restoreGeometry(bytes(geom))
+       self.languages.init_langs(self.db)
        self.languages.setEditText('')
        self.exec_()
Some files were not shown because too many files have changed in this diff.