')]
+ # remove empty tags
+ preprocess_regexps.append((re.compile(r'<strong>\s*</strong>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
+ preprocess_regexps.append((re.compile(r'<em>\s*</em>', re.DOTALL|re.IGNORECASE), lambda match: ' '))
+ preprocess_regexps.append((re.compile(r'<p>\s*</p>', re.DOTALL|re.IGNORECASE), lambda match: ''))
+ preprocess_regexps.append((re.compile(r'font-size: 12px', re.DOTALL|re.IGNORECASE), lambda match: ''))
+ preprocess_regexps.append((re.compile(r'color: #[0-9]*', re.DOTALL|re.IGNORECASE), lambda match: ''))
+
+ def parse_index(self):
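+ # calibre calls parse_index to build the periodical: it must return a list of (section title, list of article dicts) tuples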
+ # Read already downloaded articles
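+ # (the state file stores one "<url> <section title>" pair per line)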
+ recipe_dir = os.path.join(config_dir,'recipes')
+ old_articles = os.path.join(recipe_dir,self.title.encode('utf-8').replace('/',':'))
+ past_items = []
+ if os.path.exists(old_articles):
+ with open(old_articles) as f:
+ for h in f:
+ l = h.strip().split(" ")
+ past_items.append((l[0]," ".join(l[1:])))
+ old_urls = [x[0] for x in past_items]
+ count_items = {}
+ current_items = []
+ # Keep a list of only 20 latest articles for each section
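+ # (iterate from newest to oldest so the newest survive, then restore the original order afterwards)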
+ past_items.reverse()
+ for item in past_items:
+ if item[1] in count_items:
+ if count_items[item[1]] < 20:
+ count_items[item[1]] += 1
+ current_items.append(item)
+ else:
+ count_items[item[1]] = 1
+ current_items.append(item)
+ current_items.reverse()
+
+ sections = []
+ # Get the webpages to download lists of articles from
+ raw = self.index_to_soup('http://respekt.ihned.cz/sloupky-redaktoru/', raw=True)
+ root = lxml.html.fromstring(raw)
+ for section in root.xpath("//div[@class='ow-enclose sr']/table/tr/td"):
+ try:
+ url = section.find('a').get('href')
+ if '?m=authors&person[id]=' not in url:
+ sections.append((url,section.find('a').find('b').text))
+ except:
+ pass
+ sections.append(('http://respekt.ihned.cz/respekt-dj/','Respekt DJ'))
+ sections.append(('http://respekt.ihned.cz/fokus/','Fokus'))
+ sections.append(('http://respekt.ihned.cz/respekt-hub/','Respekt Hub'))
+ sections.append(('http://respekt.ihned.cz/rozhovory/','Rozhovory'))
+ sections.append(('http://respekt.ihned.cz/glosy/','Glosy'))
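+ # these sections are not linked from the sloupky-redaktoru page, so they are added explicitly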
+
+ # Get the list of articles
+ ans = []
+ for section in sections:
+ raw = self.index_to_soup(section[0], raw=True)
+ root = lxml.html.fromstring(raw)
+ list_of_articles = []
+ articles = root.xpath("//div[@class='ow-enclose']/div[@class='ow']")
+ # Sort the articles in a section from oldest to newest
+ articles.reverse()
+ for article in articles:
+ date = getattr(article.xpath("span[@class='date-author']")[0],'text','')[:-3]
+ author = getattr(article.xpath("span[@class='date-author']")[0].find("a"),'text','')
+ title = getattr(article.find("h2").find("a"),'text','')
+ url = article.find('h2').find('a').get('href')
+ # Only download new articles
+ if url not in old_urls:
+ old_urls.append(url)
+ current_items.append((url,section[1]))
+ list_of_articles.append({'title':title,'url':url,'date':date,'author':author})
+ # Redownload this page next time if it is still being updated (generally between 7 and 17 GMT, so widen the limits a little)
+ if section[1] == 'Respekt DJ':
+ if list_of_articles:
+ if datetime.datetime.today().weekday() < 5 and 6 < datetime.datetime.utcnow().hour < 17:
+ #list_of_articles = list_of_articles[:-1]
+ current_items = current_items[:-1]
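+ # not persisting the url means the page will be downloaded again on the next run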
+ if list_of_articles:
+ ans.append((section[1],list_of_articles))
+ # Write already downloaded articles
+ with open(old_articles,'w') as f:
+ f.write('\n'.join('{} {}'.format(*x) for x in current_items))
+ return ans
+
+ # For some reason, the following does not work:
+ # preprocess_regexps.append((re.compile(r'<br>', re.DOTALL|re.IGNORECASE), lambda match: '<br/>'))
+ def preprocess_raw_html(self, raw_html, url):
+ return re.sub("<br>","<br/>",raw_html)
+
+ def preprocess_html(self,soup):
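+ # Serialize the soup and re-parse it with lxml so that XPath can be used below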
+ raw = u''.join(unicode(a) for a in soup.contents)
+ root = lxml.html.fromstring(raw)
+ # Make image captions visible
+ body = root.xpath("//div[@id='text']")[0]
+ add = 0
+ for index, element in enumerate(body):
+ try:
+ if element.tag == 'img':
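+ # lxml's E builder treats a dict argument as attributes, so this yields <p class="image_caption">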
+ body.insert(index+add+1,E.p(element.get('title'),{"class":"image_caption"}))
+ add += 1
+ except:
+ pass
+ # Give captions that are already visible on the website the same style
+ try:
+ root.xpath("//div[@class='hlavni-obrazek-popis']")[0].attrib['class'] = 'image_caption'
+ except:
+ pass
+ # For DJ, the perex is always the same, so remove it
+ if root.xpath("//title")[0].text.split("|")[-1] == u' Respekt DJ - RESPEKT.CZ':
+
+ perex = root.xpath("//div[@id='perex']")[0]
+ clean = root.xpath("//div[@class='clean']")[0]
+ perex.getparent().remove(perex)
+ clean.getparent().remove(clean)
+
+ # The DJ section gets badly formatted on the Kindle otherwise
+ for i in root.xpath("//h2[@class='d-dj-t']"):
+ i.attrib['class'] = ''
+ style = "font-size:60%;font-weight:normal;"
+ time = E('span', i.getprevious().text_content(), style=style)
+ # Time should be ahead of the title
+ time.tail = ' ' + i.text
+ i.text = ''
+ i.insert(0,time)
+ for i in root.xpath("//div[@class='d-dj-d']"):
+ i.attrib['class'] = ''
+ i.xpath("div/span")[0].text = ''
+ for i in root.xpath("//div[@class='d-dj-b']"):
+ i.attrib['class'] = ''
+
+ # Give captions that are already visible on the website the same style
+ root.xpath("//div[@class='hlavni-obrazekDJ-popis']")[0].attrib['class'] = 'image_caption'
+
+ # Reverse the entries so that the earliest are at the top
+ entries = root.xpath("//div[@class='d-dj-i']")
+ entries.reverse()
+ dj_body = entries[0].getparent()
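+ # removing and re-appending each entry moves it to the end, reversing the order in place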
+ for entry in entries:
+ dj_body.remove(entry)
+ dj_body.append(entry)
+
+ # We are not interested in this paragraph as it stays the same and is essentially an ad
+ if root.xpath("//title")[0].text.split("|")[-1] == u' Audit Jana Macháčka - Respekt.iHNed.cz':
+ ad = root.xpath("//p[@id='ajmonf']")[0]
+ ad.getparent().remove(ad)
+
+ # Add the length of the article in words after the author
+ article_length = str(len(body.text_content().split(' '))) + ' slov'
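+ # 'slov' is Czech for 'words'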
+ root.xpath("//div[@class='author-image']/div[@class='']/ul")[0].append(E.li(article_length))
+
+ # Make perex (subheading) start on a new line
+ root.xpath("//h1")[0].append(E.br(''))
+
+ # Indent paragraphs when typographically suitable
+ # First paragraph is never indented
+ paragraphs = root.xpath('//p')
+ # Clear the formatting a little bit by removing these attributes
+ for par in paragraphs:
+ if 'class' in par.keys():
+ if par.attrib['class'] == 'detail-odstavec':
+ par.attrib.pop('class')
+ paragraphs.reverse()
+ for par in paragraphs[:-1]:
+ try:
+ # A <strong> tag at the beginning of this paragraph means no indenting, and neither does an ellipsis as the only text of the previous paragraph
+ if len(par) > 0:
+ if (par.text is None and par.getchildren()[0].tag == 'strong'):
+ continue
+ elif par.getprevious().text == u'\u2026':
+ continue
+ indent = False
+ # Either indent if the paragraphs are the same
+ if par.getprevious().attrib == par.attrib:
+ indent = True
+ # Or else if the first paragraph of the text was special
+ if 'class' in par.getprevious().keys():
+ par_name = par.getprevious().attrib['class']
+ if par_name in ('01prvniodstavecrepublicblok', 'Zkladnodstavec', '01titulekhlavn'):
+ indent = True
+ if indent:
+ for key in par.keys():
+ par.attrib.pop(key)
+ par.attrib['class']="indent_first_line"
+ except:
+ pass
+ return BeautifulSoup(lxml.etree.tostring(root, encoding=unicode))