Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

fix pyflakes errors

Commit 8f69acf446 (parent 42b2eeb3bc)

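For context: pyflakes parses each module and reports defects such as imports that are never used, undefined names, and redefinitions; it does not check formatting. The core of this commit is unused-import removal, with a few whitespace and style tweaks riding along. The diff touches four recipes, identifiable from the hunk context lines: JerusalemPost, pcMag, respektRecipe, and respektWebRecipe (file paths are not preserved in this view). A minimal sketch of the flagged pattern (hypothetical file name, output wording approximate):

    # demo.py -- the defect class fixed throughout this commit:
    # a name that is imported but never referenced.
    import re                      # pyflakes: 're' imported but unused
    from datetime import datetime  # referenced below, so not flagged

    def timestamp():
        return datetime.now().isoformat()

Running `python -m pyflakes demo.py` prints something like `demo.py:3: 're' imported but unused`.
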
@@ -1,4 +1,3 @@
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class JerusalemPost(BasicNewsRecipe):

@@ -14,31 +13,10 @@ class JerusalemPost(BasicNewsRecipe):
 max_articles_per_feed = 10
 no_stylesheets = True
 
-feeds = [ ('Front Page', 'http://www.jpost.com/Rss/RssFeedsFrontPage.aspx'),
+feeds = [('Front Page', 'http://www.jpost.com/Rss/RssFeedsFrontPage.aspx'),
 ('Israel News', 'http://www.jpost.com/Rss/RssFeedsIsraelNews.aspx'),
 ('Middle East News', 'http://www.jpost.com/Rss/RssFeedsMiddleEastNews.aspx'),
 ('International News', 'http://www.jpost.com/Rss/RssFeedsInternationalNews.aspx'),
 ('Editorials', 'http://www.jpost.com/Rss/RssFeedsEditorialsNews.aspx'),
 ]
 
-#remove_tags = [
-#dict(id=lambda x: x and 'ads.' in x),
-#dict(attrs={'class':['printinfo', 'tt1']}),
-#dict(onclick='DoPrint()'),
-#dict(name='input'),
-#]
-
-#conversion_options = {'linearize_tables':True}
-
-#def preprocess_html(self, soup):
-#for tag in soup.findAll('form'):
-#tag.name = 'div'
-#return soup
-
-#def print_version(self, url):
-#m = re.search(r'(ID|id)=(\d+)', url)
-#if m is not None:
-#id_ = m.group(2)
-#return 'http://www.jpost.com/LandedPages/PrintArticle.aspx?id=%s'%id_
-#return url
-

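Two things happen in the JerusalemPost hunks. The `feeds = [ (` to `feeds = [(` change just drops a stray space. More interesting is the cascade: `re` was evidently mentioned only inside the commented-out `print_version()` block, and pyflakes does not look inside comments, so deleting the dead block is what lets the `import re` at the top of the file go. Compressed illustration (recipe code paraphrased):

    import re  # flagged as unused: the only mention of `re` is in a comment

    # def print_version(self, url):
    #     m = re.search(r'(ID|id)=(\d+)', url)   # a comment, not a reference
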
@@ -9,9 +9,7 @@ __description__ = 'PCMag (www.pcmag.com) delivers authoritative, labs-based comp
 '''
 http://www.pcmag.com/
 '''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Comment
 
 class pcMag(BasicNewsRecipe):
 __author__ = 'Lorenzo Vigentini'

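Same pattern for pcMag: nothing in the remaining recipe references `re` or `Comment`, so both import lines go.
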
@@ -8,7 +8,7 @@ __copyright__ = 'tomashnyk@gmail.com'
 
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 #This imports the version bundled with Calibre
 import lxml
 from lxml.builder import E

@@ -34,7 +34,7 @@ class respektRecipe(BasicNewsRecipe):
 dict(name='p', attrs={'class':['detail-vykrik']}), \
 dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in preprocess requires this
 dict(name='strong', attrs={'class':['detail-vykrik']}),
 dict(name='script')]
 # this makes authors left-aligned by not using the author class)
 preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
 # remove empty tags

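Note that this hunk, like @@ -48,7 @@, @@ -82,14 @@ and @@ -104,7 @@ below in respektRecipe, and @@ -131,7 @@ and @@ -176,13 @@ in respektWebRecipe, shows identical text on its old and new sides: the underlying edit was whitespace-level (indentation or trailing whitespace), which survives in the line counts but not in this flattened view.
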
@@ -48,7 +48,7 @@ class respektRecipe(BasicNewsRecipe):
 soup = self.index_to_soup('http://respekt.ihned.cz/')
 cover = soup.findAll('div', attrs={'class':'cover'})[0].find('img')['src']
 return cover
 
 needs_subscription = True
 
 def get_browser(self):

@@ -82,14 +82,14 @@ class respektRecipe(BasicNewsRecipe):
 def cleanup(self):
 self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')
 
 
 def preprocess_html(self,soup):
 raw = u''.join(unicode(a) for a in soup.contents)
 root = lxml.html.fromstring(raw)
 
 # Make image captions visible
 body = root.xpath("//div[@id='text']")[0]
 add = 0
 for index, element in enumerate(body):
 try:
 if element.tag == 'img':

@@ -104,7 +104,7 @@ class respektRecipe(BasicNewsRecipe):
 
 # Make perex (subheading) start on a new line
 root.xpath("//h1")[0].append(E.br(''))
 
 # Indent paragraphs when typographically suitable
 parse = True
 # There are only single paragraphs in these sections

@@ -8,9 +8,9 @@ __copyright__ = 'tomashnyk@gmail.com'
 
 import re,os,datetime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag
-from calibre.constants import config_dir, CONFIG_DIR_MODE
-#This imports the version bundled with Calibre
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.constants import config_dir
+# This imports the version bundled with Calibre
 import lxml
 from lxml.builder import E
 

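Again unused from-imports: `Tag` (in both respekt recipes) and `CONFIG_DIR_MODE` drop out, while the names actually used, `BeautifulSoup` and `config_dir`, stay. The comment tweaks here and below (`#This` to `# This`, `#list_of_articles` to `# list_of_articles`) are PEP 8 comment-spacing fixes (pycodestyle E265) rather than pyflakes findings, evidently folded into the same cleanup.
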
@@ -30,13 +30,13 @@ class respektWebRecipe(BasicNewsRecipe):
 p.indent_first_line {text-indent:30px;}'
 remove_tags_before = dict(name='div',attrs={'class':['l']})
 remove_tags_after = dict(id='text')
-remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}), \
-dict(name='div',attrs={'class':['slot','reklama','date']}), \
-dict(name='span', attrs={'class':['detail-vykrik']}), \
-dict(name='p', attrs={'class':['detail-vykrik']}), \
+remove_tags = [dict(name='ul', attrs={'class':['tabs-d'],'id':['comm']}),
+dict(name='div',attrs={'class':['slot','reklama','date']}),
+dict(name='span', attrs={'class':['detail-vykrik']}),
+dict(name='p', attrs={'class':['detail-vykrik']}),
 dict(name='div', attrs={'id':['col123d-video','col123d-infographic','col123d-gallery','col12d-discussion']}), # soup>lxml>soup in prprocess requires this
 dict(name='strong', attrs={'class':['detail-vykrik']}),
 dict(name='script')]
 # this makes authors left-aligned by not using the author class)
 preprocess_regexps = [(re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">')]
 # remove empty tags

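The deleted trailing backslashes were redundant: inside brackets Python continues a statement implicitly, so an explicit `\` adds nothing (pycodestyle flags this as E502, "the backslash is redundant between brackets"). A minimal illustration:

    # Implicit line continuation: inside (), [] or {} a newline does not
    # terminate the statement, so no backslash is needed.
    remove_tags = [
        dict(name='ul', attrs={'class': ['tabs-d']}),
        dict(name='script'),
    ]
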
@@ -45,17 +45,17 @@ class respektWebRecipe(BasicNewsRecipe):
 preprocess_regexps.append((re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
 preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
 preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))
 
 def parse_index(self):
 # Read already downloaded articles
 recipe_dir = os.path.join(config_dir,'recipes')
-old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':'))
+old_articles = os.path.join(recipe_dir,self.title)
 past_items = []
 if os.path.exists(old_articles):
 with file(old_articles) as f:
 for h in f:
 l = h.strip().split(" ")
 past_items.append((l[0]," ".join(l[1:])))
 old_urls = [x[0] for x in past_items]
 count_items = {}
 current_items = []

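The `old_articles` change drops `.encode('utf-8').replace('/',':')` from the path built for the per-recipe download log. Note this is a small behaviour change, not just lint: a title containing `/` now yields a different path component. For reference, the log-parsing loop from the context lines, rewritten as a self-contained sketch in modern Python (the recipe itself is Python 2 era, hence `file()` and `unicode`):

    import os

    def read_past_items(old_articles_path):
        """Parse the recipe's download log: one 'url title words...' line each."""
        past_items = []
        if os.path.exists(old_articles_path):
            with open(old_articles_path, encoding='utf-8') as f:
                for line in f:
                    fields = line.strip().split(' ')
                    # first field is the article URL, the rest is its title
                    past_items.append((fields[0], ' '.join(fields[1:])))
        return past_items
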
@@ -112,7 +112,7 @@ class respektWebRecipe(BasicNewsRecipe):
 if section[1] == 'Respekt DJ':
 if list_of_articles:
 if datetime.datetime.today().weekday() in range(0,5) and 6 < datetime.datetime.utcnow().hour < 17:
-#list_of_articles = list_of_articles[:-1]
+# list_of_articles = list_of_articles[:-1]
 current_items = current_items[:-1]
 if list_of_articles:
 ans.append((section[1],list_of_articles))

@@ -131,7 +131,7 @@ class respektWebRecipe(BasicNewsRecipe):
 root = lxml.html.fromstring(raw)
 # Make image captions visible
 body = root.xpath("//div[@id='text']")[0]
 add = 0
 for index, element in enumerate(body):
 try:
 if element.tag == 'img':

@@ -146,17 +146,17 @@ class respektWebRecipe(BasicNewsRecipe):
 pass
 # For DJ, the perex is always the same, so remove it
 if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':
 
 perex = root.xpath("//div[@id='perex']")[0]
 clean = root.xpath("//div[@class='clean']")[0]
 perex.getparent().remove(perex)
 clean.getparent().remove(clean)
 
 # DJ section gets mal-formatted on kindle otherwise
 for i in root.xpath("//h2[@class='d-dj-t']"):
 i.attrib['class'] = ''
 E.style = "font-size:60%;font-weight:normal;"
-time = E('span',i.getprevious().text_content(),style = E.style)
+time = E('span',i.getprevious().text_content(),style=E.style)
 # Time should be ahead of the title
 time.tail = ' ' + i.text
 i.text = ''

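`style = E.style` to `style=E.style` removes the spaces around a keyword-argument equals sign (pycodestyle E251). The surrounding code is doing something slightly odd: it stores a CSS string on the ElementMaker as `E.style`, then passes it back as the `style` attribute of a new `<span>`. A plainer, self-contained version of the same pattern:

    from lxml import etree
    from lxml.builder import E

    # Build a <span> carrying an inline style, as the recipe does for the
    # DJ-section timestamps it moves in front of each heading.
    css = 'font-size:60%;font-weight:normal;'
    span = E('span', '12:30', style=css)
    print(etree.tostring(span))
    # b'<span style="font-size:60%;font-weight:normal;">12:30</span>'
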
@@ -176,13 +176,13 @@ class respektWebRecipe(BasicNewsRecipe):
 dj_body = entries[0].getparent()
 for entry in entries:
 dj_body.remove(entry)
 dj_body.append(entry)
 
 # We are not interested in this paragraph as it stays the same and is essentialy an ad
 if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
 ad = root.xpath("//p[@id='ajmonf']")[0]
 ad.getparent().remove(ad)
 
 # Add length of the articles in words after author
 article_length = str(len(body.text_content().split(' '))) + ' slov'
 root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))