diff --git a/recipes/jpost.recipe b/recipes/jpost.recipe index 6fa1db1e1f..4cdfc4cd12 100644 --- a/recipes/jpost.recipe +++ b/recipes/jpost.recipe @@ -1,4 +1,3 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe class JerusalemPost(BasicNewsRecipe): @@ -14,31 +13,10 @@ class JerusalemPost(BasicNewsRecipe): max_articles_per_feed = 10 no_stylesheets = True - feeds = [ ('Front Page', 'http://www.jpost.com/Rss/RssFeedsFrontPage.aspx'), + feeds = [('Front Page', 'http://www.jpost.com/Rss/RssFeedsFrontPage.aspx'), ('Israel News', 'http://www.jpost.com/Rss/RssFeedsIsraelNews.aspx'), ('Middle East News', 'http://www.jpost.com/Rss/RssFeedsMiddleEastNews.aspx'), ('International News', 'http://www.jpost.com/Rss/RssFeedsInternationalNews.aspx'), ('Editorials', 'http://www.jpost.com/Rss/RssFeedsEditorialsNews.aspx'), ] - #remove_tags = [ - #dict(id=lambda x: x and 'ads.' in x), - #dict(attrs={'class':['printinfo', 'tt1']}), - #dict(onclick='DoPrint()'), - #dict(name='input'), - #] - - #conversion_options = {'linearize_tables':True} - - #def preprocess_html(self, soup): - #for tag in soup.findAll('form'): - #tag.name = 'div' - #return soup - - #def print_version(self, url): - #m = re.search(r'(ID|id)=(\d+)', url) - #if m is not None: - #id_ = m.group(2) - #return 'http://www.jpost.com/LandedPages/PrintArticle.aspx?id=%s'%id_ - #return url - diff --git a/recipes/pc_mag.recipe b/recipes/pc_mag.recipe index fd7068cc45..1ba52ccd88 100644 --- a/recipes/pc_mag.recipe +++ b/recipes/pc_mag.recipe @@ -9,9 +9,7 @@ __description__ = 'PCMag (www.pcmag.com) delivers authoritative, labs-based comp ''' http://www.pcmag.com/ ''' -import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Comment class pcMag(BasicNewsRecipe): __author__ = 'Lorenzo Vigentini' diff --git a/recipes/respekt_magazine.recipe b/recipes/respekt_magazine.recipe index d4deb1d6e4..a16619dd83 100644 --- a/recipes/respekt_magazine.recipe +++ b/recipes/respekt_magazine.recipe @@ -8,7 +8,7 @@ __copyright__ = 'tomashnyk@gmail.com' import re from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag +from calibre.ebooks.BeautifulSoup import BeautifulSoup #This imports the version bundled with Calibre import lxml from lxml.builder import E @@ -34,7 +34,7 @@ class respektRecipe(BasicNewsRecipe): dict(name='p', attrs={'class':['detail-vykrik']}), \ dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this dict(name='strong', attrs={'class':['detail-vykrik']}), - dict(name='script')] + dict(name='script')] # this makes authors left-aligned by not using the author class) preprocess_regexps = [(re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: '
')] # remove empty tags @@ -48,7 +48,7 @@ class respektRecipe(BasicNewsRecipe): soup = self.index_to_soup('http://respekt.ihned.cz/') cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src'] return cover - + needs_subscription = True def get_browser(self): @@ -82,14 +82,14 @@ class respektRecipe(BasicNewsRecipe): def cleanup(self): self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1') - + def preprocess_html(self,soup): raw = u''.join(unicode(a) for a in soup.contents) root = lxml.html.fromstring(raw) # Make image captions visible body = root.xpath("//div[@id='text']")[0] - add = 0 + add = 0 for index, element in enumerate(body): try: if element.tag == 'img': @@ -104,7 +104,7 @@ class respektRecipe(BasicNewsRecipe): # Make perex (subheading) start on a new line root.xpath("//h1")[0].append(E.br('')) - + # Indent paragraphs when typographically suitable parse = True # There are only single paragraphs in these sections diff --git a/recipes/respekt_web.recipe b/recipes/respekt_web.recipe index 0ddf21c327..1e046a4b7e 100644 --- a/recipes/respekt_web.recipe +++ b/recipes/respekt_web.recipe @@ -8,9 +8,9 @@ __copyright__ = 'tomashnyk@gmail.com' import re,os,datetime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag -from calibre.constants import config_dir, CONFIG_DIR_MODE -#This imports the version bundled with Calibre +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.constants import config_dir +# This imports the version bundled with Calibre import lxml from lxml.builder import E @@ -30,13 +30,13 @@ class respektWebRecipe(BasicNewsRecipe): p.indent_first_line {text-indent:30px;}' remove_tags_before = dict(name='div',attrs={'class':['l']}) remove_tags_after = dict(id='text') - remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \ - dict(name='div',attrs={'class':['slot','reklama','date']}), \ - dict(name='span', attrs={'class':['detail-vykrik']}), \ - dict(name='p', attrs={'class':['detail-vykrik']}), \ + remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), + dict(name='div',attrs={'class':['slot','reklama','date']}), + dict(name='span', attrs={'class':['detail-vykrik']}), + dict(name='p', attrs={'class':['detail-vykrik']}), dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in prprocess requires this dict(name='strong', attrs={'class':['detail-vykrik']}), - dict(name='script')] + dict(name='script')] # this makes authors left-aligned by not using the author class) preprocess_regexps = [(re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: '
')] # remove empty tags @@ -45,17 +45,17 @@ class respektWebRecipe(BasicNewsRecipe): preprocess_regexps.append((re.compile(r'

', re.DOTALL|re.IGNORECASE), lambda match: '')) preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: '')) preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: '')) - + def parse_index(self): # Read already downloaded articles recipe_dir = os.path.join(config_dir,'recipes') - old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':')) + old_articles = os.path.join(recipe_dir,self.title) past_items = [] if os.path.exists(old_articles): - with file(old_articles) as f: - for h in f: - l = h.strip().split(" ") - past_items.append((l[0]," ".join(l[1:]))) + with file(old_articles) as f: + for h in f: + l = h.strip().split(" ") + past_items.append((l[0]," ".join(l[1:]))) old_urls = [x[0] for x in past_items] count_items = {} current_items = [] @@ -112,7 +112,7 @@ class respektWebRecipe(BasicNewsRecipe): if section[1] == 'Respekt DJ': if list_of_articles: if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17: - #list_of_articles = list_of_articles[:-1] + # list_of_articles = list_of_articles[:-1] current_items = current_items[:-1] if list_of_articles: ans.append((section[1],list_of_articles)) @@ -131,7 +131,7 @@ class respektWebRecipe(BasicNewsRecipe): root = lxml.html.fromstring(raw) # Make image captions visible body = root.xpath("//div[@id='text']")[0] - add = 0 + add = 0 for index, element in enumerate(body): try: if element.tag == 'img': @@ -146,17 +146,17 @@ class respektWebRecipe(BasicNewsRecipe): pass # For DJ, the perex is always the same, so remove it if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ': - + perex = root.xpath("//div[@id='perex']")[0] clean = root.xpath("//div[@class='clean']")[0] perex.getparent().remove(perex) clean.getparent().remove(clean) - + # DJ section gets mal-formatted on kindle otherwise for i in root.xpath("//h2[@class='d-dj-t']"): i.attrib['class'] = '' E.style = "font-size:60%;font-weight:normal;" - time = E('span',i.getprevious().text_content(),style = E.style) + time = E('span',i.getprevious().text_content(),style=E.style) # Time should be ahead of the title time.tail = ' ' + i.text i.text = '' @@ -176,13 +176,13 @@ class respektWebRecipe(BasicNewsRecipe): dj_body = entries[0].getparent() for entry in entries: dj_body.remove(entry) - dj_body.append(entry) + dj_body.append(entry) # We are not interested in this paragraph as it stays the same and is essentialy an ad if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz': ad = root.xpath("//p[@id='ajmonf']")[0] ad.getparent().remove(ad) - + # Add length of the articles in words after author article_length = str(len(body.text_content().split(' '))) + ' slov' root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))