Merge from trunk

commit e0e2a0bf40
Charles Haley, 2012-10-08 20:11:18 +02:00
14 changed files with 209 additions and 75 deletions

View File

@@ -15,6 +15,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     max_articles_per_feed = 20
     remove_empty_feeds = True
     remove_javascript = True
+    ignore_duplicate_articles = {'title'}
     preprocess_regexps = [
         (re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]

View File

@@ -1,11 +1,13 @@
 from calibre import browser
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'Countryfile.com'
     #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
     __author__ = 'Dave Asbury'
     description = 'The official website of Countryfile Magazine'
-    # last updated 9/9//12
+    # last updated 7/10/12
     language = 'en_GB'
     oldest_article = 30
     max_articles_per_feed = 25
@@ -13,12 +15,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     no_stylesheets = True
     auto_cleanup = True
     #articles_are_obfuscated = True
+    ignore_duplicate_articles = {'title'}
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.countryfile.com/')
-        cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
+        cov = soup.find(attrs={'width' : '160', 'class' : re.compile('imagecache imagecache-160px_wide')})
         print '******** ',cov,' ***'
         cov2 = str(cov)
-        cov2=cov2[140:223]
+        cov2=cov2[10:101]
         print '******** ',cov2,' ***'
         #cov2='http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg'
         # try to get cover - if can't get known cover
@@ -40,3 +44,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         (u'Country News', u'http://www.countryfile.com/rss/news'),
         (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
     ]
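Note that the rewritten get_cover_url still slices str(cov)[10:101] to cut the URL out of the tag text, which breaks as soon as the attribute order or class list on the page shifts. A minimal sketch of a sturdier lookup, assuming the same calibre recipe environment (index_to_soup returning a BeautifulSoup tree, the recipe's own re import) and reusing the fallback cover URL from the recipe's commented-out line:

    # Read the src attribute directly instead of slicing str(cov).
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.countryfile.com/')
        cov = soup.find('img', attrs={'width' : '160',
                'class' : re.compile('imagecache imagecache-160px_wide')})
        if cov is not None and cov.get('src'):
            return cov['src']
        # known cover, taken from the commented-out line in the recipe
        return 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg'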

View File

@@ -72,7 +72,7 @@ class DerSpiegel(BasicNewsRecipe):
             for article in section.findNextSiblings(['dd','dt']):
                 if article.name == 'dt':
                     break
-                link = article.find('a')
+                link = article.find('a', href=True)
                 title = self.tag_to_string(link).strip()
                 if title in self.empty_articles:
                     continue
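The href=True filter guards against anchors that carry no href attribute (pure name targets), which the recipe would otherwise treat as article links. A small sketch of the difference, assuming calibre's bundled BeautifulSoup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<dd><a name="top"></a><a href="/article">Story</a></dd>')
    link = soup.find('a', href=True)  # skips the name-only anchor
    print link['href']  # prints /article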

View File

@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'FHM UK'
     description = 'Good News for Men.'
@@ -7,14 +8,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
     masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
     __author__ = 'Dave Asbury'
-    # last updated 1/7/12
+    # last updated 7/10/12
     language = 'en_GB'
-    oldest_article = 28
-    max_articles_per_feed = 8
+    oldest_article = 31
+    max_articles_per_feed = 15
     remove_empty_feeds = True
     no_stylesheets = True
     #auto_cleanup = True
     # articles_are_obfuscated = True
     keep_only_tags = [
         dict(name='h1'),
         dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
@@ -28,15 +30,13 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     ]
     feeds = [
-        (u'Homepage 1',u'http://feed43.com/6655867614547036.xml'),
-        (u'Homepage 2',u'http://feed43.com/4167731873103110.xml'),
-        (u'Homepage 3',u'http://feed43.com/7667138788771570.xml'),
-        (u'Homepage 4',u'http://feed43.com/6550421522527341.xml'),
-        (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
-        (u'Gaming',u'http://feed43.com/6537162612465672.xml'),
-        (u'Girls',u'http://feed43.com/4574262733341068.xml'),# edit link http://feed43.com/feed.html?name=4574262733341068
+        # repeatable search = </div>{|}<a href="{%}" class="{*}">{%}</a>{|}<p>{*}</p>
+        (u'Homepage',u'http://rss.feedsportal.com/c/375/f/434908/index.rss'),
+        (u'Funny',u'http://rss.feedsportal.com/c/375/f/434910/index.rss'),
+        (u'Girls',u'http://rss.feedsportal.com/c/375/f/434913/index.rss'),
     ]
     extra_css = '''
         h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
         h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}

View File

@@ -4,7 +4,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     title = u'New Musical Express Magazine'
     description = 'Author D.Asbury. UK Rock & Pop Mag. '
     __author__ = 'Dave Asbury'
-    # last updated 9/6/12
+    # last updated 7/10/12
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
@@ -14,15 +14,13 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     language = 'en_GB'
     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.magazinesdirect.com/categories/mens/tv-and-music/')
-        cov = soup.find(attrs={'title' : 'NME magazine subscriptions'})
-        cov2 = 'http://www.magazinesdirect.com'+cov['src']
-        print '***cov = ',cov2,' ***'
-        cover_url = str(cov2)
+        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
+        cov = soup.find(attrs={'id' : 'magazine_cover'})
+        cov2 = str(cov['src'])
         # print '**** Cov url =*', cover_url,'***'
         #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***'
         br = browser()
         br.set_handle_redirect(False)
         try:
@@ -31,8 +29,8 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
         except:
             cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
         return cover_url
-    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
+    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
     remove_tags = [
         dict( attrs={'class':'clear_icons'}),
@@ -61,9 +59,15 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     feeds = [
-        (u'NME News', u'http://feeds2.feedburner.com/nmecom/rss/newsxml'),
+        (u'NME News', u'http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'),
         #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'),
-        (u'Reviews',u'http://feed43.com/4138608576351646.xml'),
+        (u'Reviews',u'http://feed43.com/1817687144061333.xml'),
         (u'Bloggs',u'http://feed43.com/3326754333186048.xml'),
     ]
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+    '''

recipes/pvp_online.recipe (new file)
View File

@@ -0,0 +1,18 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class AdvancedUserRecipe1344926684(BasicNewsRecipe):
+    title = u'PVP online'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    #auto_cleanup = True
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en'
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'body'})]
+    remove_tags = [dict(name='div', attrs={'class':'prevBg'}),dict(name='div', attrs={'class':'nextBg'}),dict(name='div', attrs={'class':'postMeta'})]
+    feeds = [(u'Comics', u'http://pvponline.com/feed'), ]

View File

@@ -5,13 +5,15 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe):
     title = u'Shortlist'
     description = 'Articles From Shortlist.com'
     # I've set oldest article to 7 days as the website updates weekly
-    oldest_article = 7
-    max_articles_per_feed = 12
+    oldest_article = 8
+    max_articles_per_feed = 20
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
+    ignore_duplicate_articles = {'title'}
     __author__ = 'Dave Asbury'
-    # last updated 19/5/12
+    # last updated 7/10/12
     language = 'en_GB'
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.shortlist.com')
@@ -45,17 +47,16 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe):
     ]
     feeds = [
-        (u'Home carousel',u'http://feed43.com/7106317222455380.xml'),
-        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
-        (u'Cool Stuff',u'http://feed43.com/6253845228768456.xml'),
-        (u'Style',u'http://feed43.com/7217107577215678.xml'),
-        (u'Films',u'http://feed43.com/3101308515277265.xml'),
-        (u'Music',u'http://feed43.com/2416400550560162.xml'),
-        (u'TV',u'http://feed43.com/4781172470717123.xml'),
-        (u'Sport',u'http://feed43.com/5303151885853308.xml'),
-        (u'Gaming',u'http://feed43.com/8883764600355347.xml'),
-        (u'Women',u'http://feed43.com/2648221746514241.xml'),
-        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
-        #(u'Articles', u'http://feed43.com/3428534448355545.xml')
+        #edit http://feed43.com/feed.html?name=3156308700147005
+        # repeatable pattern = <h3>{_}<a href="{%}">{%}</a>{*}</h3>
+        (u'This Weeks Issue', u'http://feed43.com/5205766657404804.xml'),
+        (u'Home Page',u'http://feed43.com/3156308700147005.xml'),
+        (u'Cool Stuff',u'http://feed43.com/1557051772026706.xml'),
+        (u'Style',u'http://feed43.com/4168836374571502.xml'),
+        (u'Entertainment',u'http://feed43.com/4578504030588024.xml'),
     ]

View File

@@ -40,6 +40,7 @@ class ANDROID(USBMS):
             0xca4 : HTC_BCDS,
             0xca9 : HTC_BCDS,
             0xcac : HTC_BCDS,
+            0xcba : HTC_BCDS,
             0xccf : HTC_BCDS,
             0xcd6 : HTC_BCDS,
             0xce5 : HTC_BCDS,

View File

@@ -12,19 +12,17 @@ Originally developed by Timothy Legge <timlegge@gmail.com>.
 Extended to support Touch firmware 2.0.0 and later and newer devices by David Forrester <davidfor@internode.on.net>
 '''
-import os, time, calendar
+import os, time
 from contextlib import closing
 from calibre.devices.usbms.books import BookList
 from calibre.devices.usbms.books import CollectionsBookList
 from calibre.devices.kobo.books import KTCollectionsBookList
 from calibre.devices.kobo.books import Book
 from calibre.devices.kobo.books import ImageWrapper
-from calibre.devices.kobo.bookmark import Bookmark
 from calibre.devices.mime import mime_type_ext
 from calibre.devices.usbms.driver import USBMS, debug_print
 from calibre import prints
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.constants import DEBUG
 from calibre.utils.config import prefs
@@ -994,6 +992,7 @@ class KOBO(USBMS):
         return USBMS.create_annotations_path(self, mdata)
     def get_annotations(self, path_map):
+        from calibre.devices.kobo.bookmark import Bookmark
         EPUB_FORMATS = [u'epub']
         epub_formats = set(EPUB_FORMATS)
@@ -1056,6 +1055,7 @@ class KOBO(USBMS):
         return bookmarked_books
     def generate_annotation_html(self, bookmark):
+        import calendar
         from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
         # Returns <div class="user_annotations"> ... </div>
         #last_read_location = bookmark.last_read_location
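All three hunks apply the same change: imports that only the annotation code paths need (Bookmark, calendar) move from module scope into the methods that use them, so loading the Kobo driver at calibre startup no longer pays for them. A generic, self-contained sketch of the deferred-import pattern:

    # The import cost is paid on first call rather than at module load.
    def format_epoch(epoch):
        import time  # deferred import; trivial here, but the idea scales
        return time.strftime('%Y-%m-%d', time.gmtime(epoch))

    print format_epoch(0)  # 1970-01-01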

View File

@@ -6,15 +6,19 @@ __docformat__ = 'restructuredtext en'
 '''
 Convert an ODT file into a Open Ebook
 '''
-import os
+import os, logging
 from lxml import etree
+from cssutils import CSSParser
+from cssutils.css import CSSRule
 from odf.odf2xhtml import ODF2XHTML
 from odf.opendocument import load as odLoad
 from odf.draw import Frame as odFrame, Image as odImage
 from odf.namespaces import TEXTNS as odTEXTNS
 from calibre import CurrentDir, walk
+from calibre.ebooks.oeb.base import _css_logger
 class Extract(ODF2XHTML):
@@ -29,14 +33,14 @@ class Extract(ODF2XHTML):
     def fix_markup(self, html, log):
         root = etree.fromstring(html)
-        self.epubify_markup(root, log)
         self.filter_css(root, log)
-        self.extract_css(root)
+        self.extract_css(root, log)
+        self.epubify_markup(root, log)
         html = etree.tostring(root, encoding='utf-8',
             xml_declaration=True)
         return html
-    def extract_css(self, root):
+    def extract_css(self, root, log):
         ans = []
         for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
             ans.append(s.text)
@@ -51,9 +55,21 @@ class Extract(ODF2XHTML):
         etree.SubElement(head, ns+'link', {'type':'text/css',
             'rel':'stylesheet', 'href':'odfpy.css'})
-        with open('odfpy.css', 'wb') as f:
-            f.write((u'\n\n'.join(ans)).encode('utf-8'))
+        css = u'\n\n'.join(ans)
+        parser = CSSParser(loglevel=logging.WARNING,
+                log=_css_logger)
+        self.css = parser.parseString(css, validate=False)
+        with open('odfpy.css', 'wb') as f:
+            f.write(css.encode('utf-8'))
+    def get_css_for_class(self, cls):
+        if not cls: return None
+        for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+            for sel in rule.selectorList:
+                q = sel.selectorText
+                if q == '.' + cls:
+                    return rule
     def epubify_markup(self, root, log):
         from calibre.ebooks.oeb.base import XPath, XHTML
@@ -84,16 +100,54 @@ class Extract(ODF2XHTML):
                 div.attrib['style'] = style
                 img.attrib['style'] = 'max-width: 100%; max-height: 100%'
-        # A div/div/img construct causes text-align:center to not work in ADE
-        # so set the display of the second div to inline. This should have no
-        # effect (apart from minor vspace issues) in a compliant HTML renderer
-        # but it fixes the centering of the image via a text-align:center on
-        # the first div in ADE
+        # Handle anchored images. The default markup + CSS produced by
+        # odf2xhtml works with WebKit but not with ADE. So we convert the
+        # common cases of left/right/center aligned block images to work on
+        # both webkit and ADE. We detect the case of setting the side margins
+        # to auto and map it to an appropriate text-align directive, which
+        # works in both WebKit and ADE.
+        # https://bugs.launchpad.net/bugs/1063207
+        # https://bugs.launchpad.net/calibre/+bug/859343
         imgpath = XPath('descendant::h:div/h:div/h:img')
         for img in imgpath(root):
             div2 = img.getparent()
             div1 = div2.getparent()
-            if len(div1) == len(div2) == 1:
+            if (len(div1), len(div2)) != (1, 1): continue
+            cls = div1.get('class', '')
+            first_rules = filter(None, [self.get_css_for_class(x) for x in
+                cls.split()])
+            has_align = False
+            for r in first_rules:
+                if r.style.getProperty(u'text-align') is not None:
+                    has_align = True
+            ml = mr = None
+            if not has_align:
+                aval = None
+                cls = div2.get(u'class', u'')
+                rules = filter(None, [self.get_css_for_class(x) for x in
+                    cls.split()])
+                for r in rules:
+                    ml = r.style.getPropertyCSSValue(u'margin-left') or ml
+                    mr = r.style.getPropertyCSSValue(u'margin-right') or mr
+                ml = getattr(ml, 'value', None)
+                mr = getattr(mr, 'value', None)
+                if ml == mr == u'auto':
+                    aval = u'center'
+                elif ml == u'auto' and mr != u'auto':
+                    aval = 'right'
+                elif ml != u'auto' and mr == u'auto':
+                    aval = 'left'
+                if aval is not None:
+                    style = div1.attrib.get('style', '').strip()
+                    if style and not style.endswith(';'):
+                        style = style + ';'
+                    style += 'text-align:%s'%aval
+                    has_align = True
+                    div1.attrib['style'] = style
+            if has_align:
+                # This is needed for ADE, without it the text-align has no
                # effect
                 style = div2.attrib['style']
                 div2.attrib['style'] = 'display:inline;'+style
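The new logic parses the stylesheet that odf2xhtml emits and, for each anchored image, maps margin-based block alignment onto a text-align directive that ADE honours: both side margins auto means centered, and a single auto margin pushes the block to the opposite side. A standalone sketch of just that detection step, assuming cssutils is installed (the class name .fM1 is made up):

    import logging
    from cssutils import CSSParser
    from cssutils.css import CSSRule

    css = u'.fM1 { margin-left: auto; margin-right: auto; }'
    sheet = CSSParser(loglevel=logging.WARNING).parseString(css, validate=False)
    for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        ml = rule.style.getPropertyCSSValue(u'margin-left')
        mr = rule.style.getPropertyCSSValue(u'margin-right')
        ml = getattr(ml, 'value', None)
        mr = getattr(mr, 'value', None)
        if ml == mr == u'auto':
            print 'center'   # both margins auto: centered block
        elif ml == u'auto':
            print 'right'    # left margin auto: pushed right
        elif mr == u'auto':
            print 'left'     # right margin auto: pushed left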

View File

@@ -211,23 +211,25 @@ def main(args=sys.argv):
         msg = compose_mail(args[1], args[2], args[3], subject=opts.subject,
                 attachment=opts.attachment)
         from_, to = args[1:3]
-        efrom, eto = map(extract_email_address, (from_, to))
-        eto = [eto]
+        eto = [extract_email_address(x.strip()) for x in to.split(',')]
+        efrom = extract_email_address(from_)
     else:
         msg = sys.stdin.read()
-        from email.parser import Parser
+        from email import message_from_string
         from email.utils import getaddresses
-        eml = Parser.parsestr(msg, headersonly=True)
+        eml = message_from_string(msg)
         tos = eml.get_all('to', [])
-        ccs = eml.get_all('cc', [])
-        eto = getaddresses(tos + ccs)
+        ccs = eml.get_all('cc', []) + eml.get_all('bcc', [])
+        all_tos = []
+        for x in tos + ccs:
+            all_tos.extend(y.strip() for y in x.split(','))
+        eto = list(map(extract_email_address, all_tos))
         if not eto:
             raise ValueError('Email from STDIN does not specify any recipients')
         efrom = getaddresses(eml.get_all('from', []))
         if not efrom:
             raise ValueError('Email from STDIN does not specify a sender')
-        efrom = efrom[0]
+        efrom = efrom[0][1]
     outbox = None
     if opts.outbox is not None:
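The stdin branch now parses the raw message with email.message_from_string, folds Bcc into the recipient set, and splits comma-separated header values before extracting bare addresses. A self-contained sketch of that recipient handling (the addresses are made up, and calibre's extract_email_address helper is left out):

    from email import message_from_string

    raw = 'From: a@example.com\nTo: b@example.com, "C" <c@example.com>\nBcc: d@example.com\n\nbody'
    eml = message_from_string(raw)
    tos = eml.get_all('to', [])
    ccs = eml.get_all('cc', []) + eml.get_all('bcc', [])
    all_tos = []
    for x in tos + ccs:
        all_tos.extend(y.strip() for y in x.split(','))
    print all_tos  # ['b@example.com', '"C" <c@example.com>', 'd@example.com']

Note that the naive comma split would still misparse a display name that itself contains a comma, such as 'Doe, Jane <j@example.com>'.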

View File

@@ -265,6 +265,12 @@ class Feed(object):
         if i > -1:
             self.articles[i:i+1] = []
+    def remove_article(self, article):
+        try:
+            self.articles.remove(article)
+        except ValueError:
+            pass
 class FeedCollection(list):
     def __init__(self, feeds):

View File

@@ -167,9 +167,10 @@ class BasicNewsRecipe(Recipe):
     extra_css = None
     #: If True empty feeds are removed from the output.
-    #: This option has no effect if parse_index is overriden in
+    #: This option has no effect if parse_index is overridden in
     #: the sub class. It is meant only for recipes that return a list
-    #: of feeds using `feeds` or :meth:`get_feeds`.
+    #: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use
+    #: the ignore_duplicate_articles option.
     remove_empty_feeds = False
     #: List of regular expressions that determines which links to follow
@@ -321,6 +322,15 @@ class BasicNewsRecipe(Recipe):
     #: The string will be used as the disabled message
     recipe_disabled = None
+    #: Ignore duplicates of articles that are present in more than one section.
+    #: A duplicate article is an article that has the same title and/or URL.
+    #: To ignore articles with the same title, set this to:
+    #: ignore_duplicate_articles = {'title'}
+    #: To use URLs instead, set it to:
+    #: ignore_duplicate_articles = {'url'}
+    #: To match on title or URL, set it to:
+    #: ignore_duplicate_articles = {'title', 'url'}
+    ignore_duplicate_articles = None
     # See the built-in profiles for examples of these settings.
@@ -1019,6 +1029,28 @@ class BasicNewsRecipe(Recipe):
         url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
         return self._fetch_article(url, dir, f, a, num_of_feeds)
+    def remove_duplicate_articles(self, feeds):
+        seen_keys = defaultdict(set)
+        remove = []
+        for f in feeds:
+            for article in f:
+                for key in self.ignore_duplicate_articles:
+                    val = getattr(article, key)
+                    seen = seen_keys[key]
+                    if val:
+                        if val in seen:
+                            remove.append((f, article))
+                        else:
+                            seen.add(val)
+        for feed, article in remove:
+            self.log.debug('Removing duplicate article: %s from section: %s'%(
+                article.title, feed.title))
+            feed.remove_article(article)
+        if self.remove_empty_feeds:
+            feeds = [f for f in feeds if len(f) > 0]
+        return feeds
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
@@ -1033,6 +1065,9 @@ class BasicNewsRecipe(Recipe):
         if not feeds:
             raise ValueError('No articles found, aborting')
+        if self.ignore_duplicate_articles is not None:
+            feeds = self.remove_duplicate_articles(feeds)
         #feeds = FeedCollection(feeds)
         self.report_progress(0, _('Trying to download cover...'))
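This new option is what the recipe diffs above are adopting: a recipe just sets ignore_duplicate_articles, and remove_duplicate_articles prunes repeats across sections before the download starts, dropping sections that end up empty when remove_empty_feeds is set. A minimal recipe sketch; the title and feed URLs are placeholders:

    from calibre.web.feeds.news import BasicNewsRecipe

    class DedupExample(BasicNewsRecipe):
        title = u'Dedup Example'
        # drop an article if its title or URL was already seen in any section
        ignore_duplicate_articles = {'title', 'url'}
        remove_empty_feeds = True  # sections emptied by deduplication vanish
        feeds = [
            (u'Section A', u'http://example.com/a.xml'),
            (u'Section B', u'http://example.com/b.xml'),
        ]

Matching is exact: two copies of a story with slightly different headlines in different sections are only caught by the 'url' key.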

View File

@@ -68,7 +68,12 @@ def serialize_collection(mapping_of_recipe_classes):
             key=lambda key: force_unicode(
                 getattr(mapping_of_recipe_classes[key], 'title', 'zzz'),
                 'utf-8')):
-        recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn])
+        try:
+            recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn])
+        except:
+            import traceback
+            traceback.print_exc()
+            continue
         collection.append(recipe)
     collection.set('count', str(len(collection)))
     return etree.tostring(collection, encoding='utf-8', xml_declaration=True,