Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit e0e2a0bf40: Merge from trunk
@@ -15,6 +15,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     max_articles_per_feed = 20
     remove_empty_feeds = True
     remove_javascript = True
+    ignore_duplicate_articles = {'title'}

     preprocess_regexps = [
         (re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
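The hunk above keeps a preprocess_regexps entry: each entry pairs a compiled pattern with a substitution callable, applied to the downloaded HTML before parsing. A minimal standalone sketch (not part of this commit) of roughly how such a pair behaves:

    import re

    preprocess_regexps = [
        (re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?'
                    r'<!-- End tmpl module_competition_offer-->',
                    re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    html = ('<p>keep</p><!-- Begin tmpl module_competition_offer -->ad'
            '<!-- End tmpl module_competition_offer--><p>keep</p>')
    for pattern, repl in preprocess_regexps:
        html = pattern.sub(repl, html)  # DOTALL lets .*? span newlines
    print(html)  # -> <p>keep</p><p>keep</p>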
@@ -1,11 +1,13 @@
+from calibre import browser
 from calibre.web.feeds.news import BasicNewsRecipe
+import re

 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'Countryfile.com'
     #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
     __author__ = 'Dave Asbury'
     description = 'The official website of Countryfile Magazine'
-    # last updated 9/9//12
+    # last updated 7/10/12
     language = 'en_GB'
     oldest_article = 30
     max_articles_per_feed = 25
@@ -13,12 +15,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     no_stylesheets = True
     auto_cleanup = True
     #articles_are_obfuscated = True
+    ignore_duplicate_articles = {'title'}
+
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.countryfile.com/')
-        cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
+        cov = soup.find(attrs={'width' : '160', 'class' : re.compile('imagecache imagecache-160px_wide')})
         print '******** ',cov,' ***'
         cov2 = str(cov)
-        cov2=cov2[140:223]
+        cov2=cov2[10:101]
         print '******** ',cov2,' ***'
         #cov2='http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg'
         # try to get cover - if can't get known cover
@@ -40,3 +44,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
         (u'Country News', u'http://www.countryfile.com/rss/news'),
         (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
     ]
+
+
+
@@ -72,7 +72,7 @@ class DerSpiegel(BasicNewsRecipe):
             for article in section.findNextSiblings(['dd','dt']):
                 if article.name == 'dt':
                     break
-                link = article.find('a')
+                link = article.find('a', href=True)
                 title = self.tag_to_string(link).strip()
                 if title in self.empty_articles:
                     continue
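The one-line DerSpiegel change matters because a section can contain anchors without an href (named anchors, script stubs); find('a', href=True) only matches anchors that actually carry the attribute, so the later tag_to_string(link) never sees a useless match. Illustrative sketch, assuming calibre's bundled BeautifulSoup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<dd><a name="top"></a><a href="/artikel">Lesen</a></dd>')
    link = soup.find('a')              # may return the href-less <a name="top">
    link = soup.find('a', href=True)   # only anchors with an href attribute
    print(link['href'])                # -> /artikel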
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+

 class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     title = u'FHM UK'
     description = 'Good News for Men.'
@@ -7,14 +8,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
     masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
     __author__ = 'Dave Asbury'
-    # last updated 1/7/12
+    # last updated 7/10/12
     language = 'en_GB'
-    oldest_article = 28
-    max_articles_per_feed = 8
+    oldest_article = 31
+    max_articles_per_feed = 15
+    remove_empty_feeds = True
     no_stylesheets = True
     #auto_cleanup = True
     # articles_are_obfuscated = True

     keep_only_tags = [
         dict(name='h1'),
         dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
@@ -28,15 +30,13 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):

     ]
     feeds = [
-        (u'Homepage 1',u'http://feed43.com/6655867614547036.xml'),
-        (u'Homepage 2',u'http://feed43.com/4167731873103110.xml'),
-        (u'Homepage 3',u'http://feed43.com/7667138788771570.xml'),
-        (u'Homepage 4',u'http://feed43.com/6550421522527341.xml'),
-        (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
-        (u'Gaming',u'http://feed43.com/6537162612465672.xml'),
-        (u'Girls',u'http://feed43.com/4574262733341068.xml'),# edit link http://feed43.com/feed.html?name=4574262733341068
-        # repeatable search = </div>{|}<a href="{%}" class="{*}">{%}</a>{|}<p>{*}</p>
+        (u'Homepage',u'http://rss.feedsportal.com/c/375/f/434908/index.rss'),
+        (u'Funny',u'http://rss.feedsportal.com/c/375/f/434910/index.rss'),
+        (u'Girls',u'http://rss.feedsportal.com/c/375/f/434913/index.rss'),
     ]


     extra_css = '''
         h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
         h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
@@ -4,7 +4,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     title = u'New Musical Express Magazine'
     description = 'Author D.Asbury. UK Rock & Pop Mag. '
     __author__ = 'Dave Asbury'
-    # last updated 9/6/12
+    # last updated 7/10/12
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
@@ -14,15 +14,13 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     language = 'en_GB'

     def get_cover_url(self):
-        soup = self.index_to_soup('http://www.magazinesdirect.com/categories/mens/tv-and-music/')
-        cov = soup.find(attrs={'title' : 'NME magazine subscriptions'})
-        cov2 = 'http://www.magazinesdirect.com'+cov['src']
-        print '***cov = ',cov2,' ***'
-
-        cover_url = str(cov2)
+        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
+        cov = soup.find(attrs={'id' : 'magazine_cover'})
+        cov2 = str(cov['src'])
+        # print '**** Cov url =*', cover_url,'***'
+        #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***'


         br = browser()
         br.set_handle_redirect(False)
         try:
@@ -31,8 +29,8 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
         except:
             cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
         return cover_url
-    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'

+    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'

     remove_tags = [
         dict( attrs={'class':'clear_icons'}),
@@ -61,9 +59,15 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):


     feeds = [
-        (u'NME News', u'http://feeds2.feedburner.com/nmecom/rss/newsxml'),
+        (u'NME News', u'http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'),
         #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'),
-        (u'Reviews',u'http://feed43.com/4138608576351646.xml'),
+        (u'Reviews',u'http://feed43.com/1817687144061333.xml'),
+        (u'Bloggs',u'http://feed43.com/3326754333186048.xml'),

     ]
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+    '''
recipes/pvp_online.recipe (new file, 18 lines)
@@ -0,0 +1,18 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1344926684(BasicNewsRecipe):
+    title = u'PVP online'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    #auto_cleanup = True
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en'
+    remove_javascript = True
+
+    keep_only_tags = [dict(name='div', attrs={'class':'body'})]
+    remove_tags = [dict(name='div', attrs={'class':'prevBg'}),dict(name='div', attrs={'class':'nextBg'}),dict(name='div', attrs={'class':'postMeta'})]
+
+    feeds = [(u'Comics', u'http://pvponline.com/feed'), ]
+
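The new recipe relies entirely on keep_only_tags/remove_tags, which take the same matchers BeautifulSoup uses. A rough sketch (not calibre's actual cleanup code) of the intended effect on one comic page:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = '<div class="postMeta">meta</div><div class="body"><img src="strip.png"/></div>'
    soup = BeautifulSoup(html)
    body = soup.find('div', attrs={'class':'body'})           # keep_only_tags
    for cls in ('prevBg', 'nextBg', 'postMeta'):
        for tag in body.findAll('div', attrs={'class':cls}):  # remove_tags
            tag.extract()
    print(body)  # -> <div class="body"><img src="strip.png" /></div>

The quickest way to check such a recipe is calibre's recipe test mode, e.g. ebook-convert pvp_online.recipe .epub --test.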
@@ -5,13 +5,15 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe):
     title = u'Shortlist'
     description = 'Articles From Shortlist.com'
     # I've set oldest article to 7 days as the website updates weekly
-    oldest_article = 7
-    max_articles_per_feed = 12
+    oldest_article = 8
+    max_articles_per_feed = 20
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
+    ignore_duplicate_articles = {'title'}

     __author__ = 'Dave Asbury'
-    # last updated 19/5/12
+    # last updated 7/10/12
     language = 'en_GB'
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.shortlist.com')
@@ -45,17 +47,16 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe):
     ]

     feeds = [
-        (u'Home carousel',u'http://feed43.com/7106317222455380.xml'),
-        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
-        (u'Cool Stuff',u'http://feed43.com/6253845228768456.xml'),
-        (u'Style',u'http://feed43.com/7217107577215678.xml'),
-        (u'Films',u'http://feed43.com/3101308515277265.xml'),
-        (u'Music',u'http://feed43.com/2416400550560162.xml'),
-        (u'TV',u'http://feed43.com/4781172470717123.xml'),
-        (u'Sport',u'http://feed43.com/5303151885853308.xml'),
-        (u'Gaming',u'http://feed43.com/8883764600355347.xml'),
-        (u'Women',u'http://feed43.com/2648221746514241.xml'),
-        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
+        #edit http://feed43.com/feed.html?name=3156308700147005
+        # repeatable pattern = <h3>{_}<a href="{%}">{%}</a>{*}</h3>
+
+        (u'This Weeks Issue', u'http://feed43.com/5205766657404804.xml'),
+        (u'Home Page',u'http://feed43.com/3156308700147005.xml'),
+        (u'Cool Stuff',u'http://feed43.com/1557051772026706.xml'),
+        (u'Style',u'http://feed43.com/4168836374571502.xml'),
+        (u'Entertainment',u'http://feed43.com/4578504030588024.xml'),
+
+
+        #(u'Articles', u'http://feed43.com/3428534448355545.xml')
     ]

@@ -40,6 +40,7 @@ class ANDROID(USBMS):
                 0xca4 : HTC_BCDS,
                 0xca9 : HTC_BCDS,
                 0xcac : HTC_BCDS,
+                0xcba : HTC_BCDS,
                 0xccf : HTC_BCDS,
                 0xcd6 : HTC_BCDS,
                 0xce5 : HTC_BCDS,
@@ -12,19 +12,17 @@ Originally developed by Timothy Legge <timlegge@gmail.com>.
 Extended to support Touch firmware 2.0.0 and later and newer devices by David Forrester <davidfor@internode.on.net>
 '''

-import os, time, calendar
+import os, time
 from contextlib import closing
 from calibre.devices.usbms.books import BookList
-from calibre.devices.usbms.books import CollectionsBookList
+from calibre.devices.kobo.books import KTCollectionsBookList
 from calibre.devices.kobo.books import Book
 from calibre.devices.kobo.books import ImageWrapper
-from calibre.devices.kobo.bookmark import Bookmark
 from calibre.devices.mime import mime_type_ext
 from calibre.devices.usbms.driver import USBMS, debug_print
 from calibre import prints
 from calibre.ptempfile import PersistentTemporaryFile
-
 from calibre.constants import DEBUG
+from calibre.utils.config import prefs
@@ -994,6 +992,7 @@ class KOBO(USBMS):
         return USBMS.create_annotations_path(self, mdata)

     def get_annotations(self, path_map):
+        from calibre.devices.kobo.bookmark import Bookmark
         EPUB_FORMATS = [u'epub']
         epub_formats = set(EPUB_FORMATS)
@@ -1056,6 +1055,7 @@ class KOBO(USBMS):
         return bookmarked_books

     def generate_annotation_html(self, bookmark):
+        import calendar
         from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
         # Returns <div class="user_annotations"> ... </div>
         #last_read_location = bookmark.last_read_location
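Both Kobo hunks move imports (Bookmark, calendar) from module level into the single method that needs them, so merely loading the driver stays cheap and can avoid import cycles. The pattern in miniature, stdlib only:

    import time

    def annotation_epoch(timetuple):
        import calendar  # deferred: paid only when annotations are generated
        return calendar.timegm(timetuple)

    print(annotation_epoch(time.gmtime(0)))  # -> 0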
@@ -6,15 +6,19 @@ __docformat__ = 'restructuredtext en'
 '''
 Convert an ODT file into a Open Ebook
 '''
-import os
+import os, logging

 from lxml import etree
+from cssutils import CSSParser
+from cssutils.css import CSSRule

 from odf.odf2xhtml import ODF2XHTML
 from odf.opendocument import load as odLoad
 from odf.draw import Frame as odFrame, Image as odImage
 from odf.namespaces import TEXTNS as odTEXTNS

 from calibre import CurrentDir, walk
+from calibre.ebooks.oeb.base import _css_logger

 class Extract(ODF2XHTML):

@@ -29,14 +33,14 @@ class Extract(ODF2XHTML):

     def fix_markup(self, html, log):
         root = etree.fromstring(html)
-        self.epubify_markup(root, log)
         self.filter_css(root, log)
-        self.extract_css(root)
+        self.extract_css(root, log)
+        self.epubify_markup(root, log)
         html = etree.tostring(root, encoding='utf-8',
                 xml_declaration=True)
         return html

-    def extract_css(self, root):
+    def extract_css(self, root, log):
         ans = []
         for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
             ans.append(s.text)
@@ -51,9 +55,21 @@ class Extract(ODF2XHTML):
             etree.SubElement(head, ns+'link', {'type':'text/css',
                 'rel':'stylesheet', 'href':'odfpy.css'})

+        css = u'\n\n'.join(ans)
+        parser = CSSParser(loglevel=logging.WARNING,
+                log=_css_logger)
+        self.css = parser.parseString(css, validate=False)
+
         with open('odfpy.css', 'wb') as f:
-            f.write((u'\n\n'.join(ans)).encode('utf-8'))
+            f.write(css.encode('utf-8'))
+
+    def get_css_for_class(self, cls):
+        if not cls: return None
+        for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
+            for sel in rule.selectorList:
+                q = sel.selectorText
+                if q == '.' + cls:
+                    return rule

     def epubify_markup(self, root, log):
         from calibre.ebooks.oeb.base import XPath, XHTML
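get_css_for_class is a linear scan over the parsed sheet for an exact ".classname" selector. A self-contained check of that lookup, using the same cssutils calls as the patch:

    import logging
    from cssutils import CSSParser
    from cssutils.css import CSSRule

    sheet = CSSParser(loglevel=logging.WARNING).parseString(
        '.frame1 { margin-left: auto; margin-right: auto; }', validate=False)

    def css_for_class(cls):
        for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            for sel in rule.selectorList:
                if sel.selectorText == '.' + cls:
                    return rule

    print(css_for_class('frame1').style.getPropertyValue('margin-left'))  # -> auto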
@@ -84,16 +100,54 @@ class Extract(ODF2XHTML):
                     div.attrib['style'] = style
                     img.attrib['style'] = 'max-width: 100%; max-height: 100%'

-        # A div/div/img construct causes text-align:center to not work in ADE
-        # so set the display of the second div to inline. This should have no
-        # effect (apart from minor vspace issues) in a compliant HTML renderer
-        # but it fixes the centering of the image via a text-align:center on
-        # the first div in ADE
+        # Handle anchored images. The default markup + CSS produced by
+        # odf2xhtml works with WebKit but not with ADE. So we convert the
+        # common cases of left/right/center aligned block images to work on
+        # both webkit and ADE. We detect the case of setting the side margins
+        # to auto and map it to an appropriate text-align directive, which
+        # works in both WebKit and ADE.
+        # https://bugs.launchpad.net/bugs/1063207
+        # https://bugs.launchpad.net/calibre/+bug/859343
         imgpath = XPath('descendant::h:div/h:div/h:img')
         for img in imgpath(root):
             div2 = img.getparent()
             div1 = div2.getparent()
-            if len(div1) == len(div2) == 1:
-                style = div2.attrib['style']
-                div2.attrib['style'] = 'display:inline;'+style
+            if (len(div1), len(div2)) != (1, 1): continue
+            cls = div1.get('class', '')
+            first_rules = filter(None, [self.get_css_for_class(x) for x in
+                cls.split()])
+            has_align = False
+            for r in first_rules:
+                if r.style.getProperty(u'text-align') is not None:
+                    has_align = True
+            ml = mr = None
+            if not has_align:
+                aval = None
+                cls = div2.get(u'class', u'')
+                rules = filter(None, [self.get_css_for_class(x) for x in
+                    cls.split()])
+                for r in rules:
+                    ml = r.style.getPropertyCSSValue(u'margin-left') or ml
+                    mr = r.style.getPropertyCSSValue(u'margin-right') or mr
+                ml = getattr(ml, 'value', None)
+                mr = getattr(mr, 'value', None)
+                if ml == mr == u'auto':
+                    aval = u'center'
+                elif ml == u'auto' and mr != u'auto':
+                    aval = 'right'
+                elif ml != u'auto' and mr == u'auto':
+                    aval = 'left'
+                if aval is not None:
+                    style = div1.attrib.get('style', '').strip()
+                    if style and not style.endswith(';'):
+                        style = style + ';'
+                    style += 'text-align:%s'%aval
+                    has_align = True
+                    div1.attrib['style'] = style
+
+            if has_align:
+                # This is needed for ADE, without it the text-align has no
+                # effect
+                style = div2.attrib['style']
+                div2.attrib['style'] = 'display:inline;'+style
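The heart of the new anchored-image handling is the margin-to-alignment mapping described in the comments. Distilled into a plain function (illustrative only):

    def align_from_margins(ml, mr):
        # ml, mr: the resolved margin-left/margin-right values, or None
        if ml == mr == 'auto':
            return 'center'
        if ml == 'auto' and mr != 'auto':
            return 'right'
        if ml != 'auto' and mr == 'auto':
            return 'left'
        return None  # no auto margins: leave the markup alone

    assert align_from_margins('auto', 'auto') == 'center'
    assert align_from_margins('auto', '0pt') == 'right'
    assert align_from_margins('0pt', 'auto') == 'left'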
@@ -211,23 +211,25 @@ def main(args=sys.argv):
         msg = compose_mail(args[1], args[2], args[3], subject=opts.subject,
                 attachment=opts.attachment)
         from_, to = args[1:3]
-        efrom, eto = map(extract_email_address, (from_, to))
-        eto = [eto]
+        eto = [extract_email_address(x.strip()) for x in to.split(',')]
+        efrom = extract_email_address(from_)
     else:
         msg = sys.stdin.read()
-        from email.parser import Parser
+        from email import message_from_string
         from email.utils import getaddresses
-        eml = Parser.parsestr(msg, headersonly=True)
+        eml = message_from_string(msg)
         tos = eml.get_all('to', [])
-        ccs = eml.get_all('cc', [])
-        eto = getaddresses(tos + ccs)
+        ccs = eml.get_all('cc', []) + eml.get_all('bcc', [])
+        all_tos = []
+        for x in tos + ccs:
+            all_tos.extend(y.strip() for y in x.split(','))
+        eto = list(map(extract_email_address, all_tos))
+        if not eto:
+            raise ValueError('Email from STDIN does not specify any recipients')
         efrom = getaddresses(eml.get_all('from', []))
         if not efrom:
             raise ValueError('Email from STDIN does not specify a sender')
-        efrom = efrom[0]
+        efrom = efrom[0][1]

     outbox = None
     if opts.outbox is not None:
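The command-line path now splits comma-separated recipients itself, while the STDIN path reads To/Cc/Bcc headers from the parsed message; getaddresses (already used for the sender) is the stdlib way to take such header lists apart. Illustrative:

    from email import message_from_string
    from email.utils import getaddresses

    raw = 'From: a@x.com\nTo: b@y.com, "C" <c@z.com>\nCc: d@w.com\n\nbody'
    eml = message_from_string(raw)
    recips = eml.get_all('to', []) + eml.get_all('cc', []) + eml.get_all('bcc', [])
    print(getaddresses(recips))
    # -> [('', 'b@y.com'), ('C', 'c@z.com'), ('', 'd@w.com')]
    print(getaddresses(eml.get_all('from', []))[0][1])  # -> a@x.com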
@@ -265,6 +265,12 @@ class Feed(object):
         if i > -1:
             self.articles[i:i+1] = []

+    def remove_article(self, article):
+        try:
+            self.articles.remove(article)
+        except ValueError:
+            pass
+
 class FeedCollection(list):

     def __init__(self, feeds):
@@ -167,9 +167,10 @@ class BasicNewsRecipe(Recipe):
     extra_css = None

     #: If True empty feeds are removed from the output.
-    #: This option has no effect if parse_index is overriden in
+    #: This option has no effect if parse_index is overridden in
     #: the sub class. It is meant only for recipes that return a list
-    #: of feeds using `feeds` or :meth:`get_feeds`.
+    #: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use
+    #: the ignore_duplicate_articles option.
     remove_empty_feeds = False

     #: List of regular expressions that determines which links to follow
@@ -321,6 +322,15 @@ class BasicNewsRecipe(Recipe):
     #: The string will be used as the disabled message
     recipe_disabled = None

+    #: Ignore duplicates of articles that are present in more than one section.
+    #: A duplicate article is an article that has the same title and/or URL.
+    #: To ignore articles with the same title, set this to:
+    #: ignore_duplicate_articles = {'title'}
+    #: To use URLs instead, set it to:
+    #: ignore_duplicate_articles = {'url'}
+    #: To match on title or URL, set it to:
+    #: ignore_duplicate_articles = {'title', 'url'}
+    ignore_duplicate_articles = None

     # See the built-in profiles for examples of these settings.
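A hypothetical recipe fragment using the option documented above (names and URLs invented for illustration):

    from calibre.web.feeds.news import BasicNewsRecipe

    class DedupedRecipe(BasicNewsRecipe):
        title = 'Example'
        feeds = [('Feed A', 'http://example.com/a.rss'),
                 ('Feed B', 'http://example.com/b.rss')]
        # Drop an article whose title was already seen in an earlier section;
        # remove_empty_feeds then discards any section this leaves empty.
        ignore_duplicate_articles = {'title'}
        remove_empty_feeds = True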
@@ -1019,6 +1029,28 @@ class BasicNewsRecipe(Recipe):
             url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
             return self._fetch_article(url, dir, f, a, num_of_feeds)

+    def remove_duplicate_articles(self, feeds):
+        seen_keys = defaultdict(set)
+        remove = []
+        for f in feeds:
+            for article in f:
+                for key in self.ignore_duplicate_articles:
+                    val = getattr(article, key)
+                    seen = seen_keys[key]
+                    if val:
+                        if val in seen:
+                            remove.append((f, article))
+                        else:
+                            seen.add(val)
+
+        for feed, article in remove:
+            self.log.debug('Removing duplicate article: %s from section: %s'%(
+                article.title, feed.title))
+            feed.remove_article(article)
+
+        if self.remove_empty_feeds:
+            feeds = [f for f in feeds if len(f) > 0]
+        return feeds
+
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
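The bookkeeping above keeps one "seen" set per configured key, so title and url duplicates are tracked independently and only the first sighting of a value survives. The core pattern, reduced:

    from collections import defaultdict

    seen_keys = defaultdict(set)   # one set per key: 'title', 'url'

    def is_duplicate(key, value):
        seen = seen_keys[key]
        if value in seen:
            return True
        seen.add(value)
        return False

    print(is_duplicate('title', 'Hello'))  # -> False (first sighting)
    print(is_duplicate('title', 'Hello'))  # -> True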
@@ -1033,6 +1065,9 @@ class BasicNewsRecipe(Recipe):
         if not feeds:
             raise ValueError('No articles found, aborting')

+        if self.ignore_duplicate_articles is not None:
+            feeds = self.remove_duplicate_articles(feeds)
+
         #feeds = FeedCollection(feeds)

         self.report_progress(0, _('Trying to download cover...'))
@@ -68,7 +68,12 @@ def serialize_collection(mapping_of_recipe_classes):
             key=lambda key: force_unicode(
                 getattr(mapping_of_recipe_classes[key], 'title', 'zzz'),
                 'utf-8')):
-        recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn])
+        try:
+            recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn])
+        except:
+            import traceback
+            traceback.print_exc()
+            continue
         collection.append(recipe)
     collection.set('count', str(len(collection)))
     return etree.tostring(collection, encoding='utf-8', xml_declaration=True,