# Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-09-29 15:31:08 -04:00); 149 lines, 6.9 KiB, Python.
#!/usr/bin/python2
|
|
# -*- coding: utf-8 -*-
|
|
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
|
# Copyright: tomashnyk@gmail.com
|
|
|
|
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
|
__copyright__ = 'tomashnyk@gmail.com'
|
|
|
|
import re
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
|
#This imports the version bundled with Calibre
|
|
import lxml
|
|
from lxml.builder import E
|
|
|
|
class respektRecipe(BasicNewsRecipe):
    """Recipe that downloads the current issue of Respekt from respekt.ihned.cz.

    The recipe logs in at muj-ucet.ihned.cz with the configured credentials,
    builds the section/article index from the current-issue page and then
    post-processes every article: image titles become visible captions, a
    word count is appended to the author box and paragraphs get a
    first-line indent where that is typographically suitable.
    """

    __author__ = u'Tomáš Hnyk'
    title = u'Respekt - Magazine'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the printed edition, password needed for full access'
    encoding = 'cp1250'
    language = 'cs'
    needs_subscription = True
    remove_javascript = True
    extra_css = (
        'p {text-align:justify} '
        'ul {color:black} '
        '.image_caption {font-size:50%;font-style:italic;} '
        '.author {text-align:left;} '
        'p.indent_first_line {text-indent:30px;}'
    )
    remove_tags_before = dict(name='div', attrs={'class': ['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [
        dict(name='ul', attrs={'class': ['tabs-d'], 'id': ['comm']}),
        dict(name='div', attrs={'class': ['slot', 'reklama', 'date']}),
        dict(name='span', attrs={'class': ['detail-vykrik']}),
        dict(name='p', attrs={'class': ['detail-vykrik']}),
        # soup>lxml>soup in preprocess requires this
        dict(name='div', attrs={'id': ['col123d-video', 'col123d-infographic', 'col123d-gallery', 'col12d-discussion']}),
        dict(name='strong', attrs={'class': ['detail-vykrik']}),
        dict(name='script'),
    ]

    preprocess_regexps = [
        # Blank the class of the author div so it is left-aligned; the XPath
        # in _append_word_count() looks for this emptied class attribute.
        (re.compile(r'<div class="author">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="">'),
        # Remove empty tags.
        (re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '),
        # NOTE(review): as seen here this pattern is identical to the previous
        # one; it may originally have targeted a non-breaking space - confirm.
        (re.compile(r'<strong> </strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '),
        (re.compile(r'<p></p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # Strip inline font sizes and colours so extra_css decides the look.
        (re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # Hex digits, not only 0-9: otherwise "color: #ff0000" would leave a
        # stray "ff0000" behind in the style attribute.
        (re.compile(r'color: #[0-9a-f]*', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    # Page titles of the two digest sections that consist of single
    # paragraphs only; paragraphs there are never indented.
    _SINGLE_PARAGRAPH_TITLES = (
        u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ",
        u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ",
    )

    # Classes of a preceding element after which the paragraph is indented
    # even though the attributes of the two elements differ.
    _INDENT_AFTER_CLASSES = ('01prvniodstavecrepublicblok', 'Zkladnodstavec', '01titulekhlavn')

    def get_cover_url(self):
        """Return the ``src`` of the cover image on the front page.

        Returns None when the page has no ``div.cover`` or the div holds no
        ``img`` with a ``src`` attribute.
        """
        soup = self.index_to_soup('http://respekt.ihned.cz/')
        cover_div = soup.find('div', attrs={'class': 'cover'})
        if cover_div is None:
            return None
        image = cover_div.find('img')
        if image is None:
            return None
        return image.get('src')

    def get_browser(self):
        """Return the base browser, logged in when both credentials are set."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://muj-ucet.ihned.cz/')
            br.select_form(name='login')
            br['login[nick]'] = self.username
            br['login[pass]'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the index of the current issue.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the sections first appear on the page; every article is a dict with
        the keys ``title``, ``url`` and ``date``.
        """
        # A plain ``import lxml`` does not load the ``lxml.html`` submodule.
        import lxml.html

        raw = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/', raw=True)
        root = lxml.html.fromstring(raw)
        sections = []
        # Maps a section title to the very list object stored in ``sections``
        # so that an article is filed without rescanning every section.
        articles_by_section = {}
        for article in root.xpath("//div[@class='ow-enclose']/div[@class='ow']"):
            section_title = article.xpath(".//span[text()='(rubrika: ']")[0].find("a").text
            # The last three characters of the date/author text are dropped.
            date = article.xpath("span[@class='date-author']")[0].text[:-3]
            headline = article.find("h2").find("a")
            link = {'title': headline.text, 'url': headline.get('href'), 'date': date}
            if section_title not in articles_by_section:
                articles_by_section[section_title] = []
                sections.append((section_title, articles_by_section[section_title]))
            articles_by_section[section_title].append(link)
        return sections

    def cleanup(self):
        """Log out of the subscriber account."""
        self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')

    def preprocess_html(self, soup):
        """Restyle one article and return it as a new BeautifulSoup object.

        The soup is serialised, parsed with lxml, modified and converted back.
        Each step is skipped when the element it works on is missing.
        """
        # A plain ``import lxml`` does not load these submodules.
        import lxml.etree
        import lxml.html

        raw = u''.join(unicode(a) for a in soup.contents)
        root = lxml.html.fromstring(raw)

        bodies = root.xpath("//div[@id='text']")
        if bodies:
            body = bodies[0]
            # Captions go in first so that their text is part of the count.
            self._add_image_captions(body)
            self._append_word_count(root, body)

        # Make perex (subheading) start on a new line
        headings = root.xpath("//h1")
        if headings:
            headings[0].append(E.br(''))

        # Indent paragraphs when typographically suitable
        titles = root.xpath("//title")
        page_title = titles[0].text if titles else None
        if page_title not in self._SINGLE_PARAGRAPH_TITLES:
            self._indent_paragraphs(root)

        return BeautifulSoup(lxml.etree.tostring(root, encoding=unicode))

    def _add_image_captions(self, body):
        """Insert a caption paragraph after every titled image directly in *body*."""
        # Collect the images first so that the inserts below never happen
        # while the children of ``body`` are being iterated.
        images = [child for child in body if child.tag == 'img']
        for image in images:
            caption = image.get('title')
            if caption is None:
                # Nothing to show for an image without a title.
                continue
            body.insert(body.index(image) + 1, E.p(caption, {"class": "image_caption"}))

    def _append_word_count(self, root, body):
        """Append '<n> slov' (the word count of *body*) to the author list."""
        # split() without an argument collapses runs of whitespace and also
        # breaks on newlines, so empty strings are not counted as words.
        article_length = str(len(body.text_content().split())) + ' slov'
        # The inner div has an empty class because preprocess_regexps blanked
        # its original "author" class.
        author_lists = root.xpath("//div[@class='author-image']/div[@class='']/ul")
        if author_lists:
            author_lists[0].append(E.li(article_length))

    def _indent_paragraphs(self, root):
        """Mark every suitable paragraph except the first with 'indent_first_line'."""
        paragraphs = root.xpath('//p')
        # Clear the formatting a little bit by removing these attributes
        for par in paragraphs:
            if par.get('class') == 'detail-odstavec':
                del par.attrib['class']
        # First paragraph is never indented. The rest is walked from the last
        # to the first, so each paragraph is compared with a predecessor whose
        # attributes have not been rewritten yet.
        for par in reversed(paragraphs[1:]):
            if self._needs_indent(par):
                par.attrib.clear()
                par.attrib['class'] = "indent_first_line"

    def _needs_indent(self, par):
        """Return True when *par* should get a first-line indent."""
        previous = par.getprevious()
        if previous is None:
            # No preceding sibling to compare with: leave the paragraph alone.
            return False
        # <strong> in the beginning of this paragraph means no indenting as
        # well as ellipses as the only text in paragraph.
        # NOTE(review): the indentation of the source was lost; the ellipsis
        # test is taken to pair with the ``len(par) > 0`` test - confirm.
        if len(par) > 0:
            if par.text is None and par[0].tag == 'strong':
                return False
        elif previous.text == u'\u2026':
            return False
        # Either indent if the paragraphs are the same
        if previous.attrib == par.attrib:
            return True
        # Or else if the first paragraph of the text was special
        return previous.get('class') in self._INDENT_AFTER_CLASSES
|