Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-06-23 15:30:45 -04:00
Hurriyet Daily News by spswerling

parent 2b23b0d342
commit 92cfab55a1
recipes/hurriyet_daily_news.recipe (new file, 260 lines)

@@ -0,0 +1,260 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.hurriyetdailynews.com
'''

import os, string, inspect, datetime, re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

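# To try this recipe from the command line, something along these lines
# usually works (exact flags depend on the installed calibre version):
#   ebook-convert hurriyet_daily_news.recipe output.epub --test -vv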
class HurriyetDailyNews_en(BasicNewsRecipe):
    title = u'Hurriyet Daily News'
    __author__ = u'spswerling'
    description = 'English version of the Turkish daily "Hurriyet"'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (Kindle Touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 25
    max_articles_per_subsection = 7

    sections = [
        u'turkey',
        u'economy',
        u'world',
        u'sports',
        # u'life',
        u'opinion',
        # u'arts/culture'
    ]

    # Utility for creating remove_tags / keep_only_tags style regex matchers.
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]

    remove_tags = [
        tag_matcher('div', 'class', 'Carousel'),
        tag_matcher('div', 'class', 'ShareIt'),
        tag_matcher('div', 'class', 'tmz'),
        tag_matcher('span', 'id', 'comment'),
        tag_matcher('h2', 'class', 'NewSpot'),
        tag_matcher('h2', 'class', 'pv-gallery'),
    ]

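    # Crawl state shared by the methods below: 'articles' maps a section name
    # to its queued article dicts, while 'subsection_links' and 'urls_done'
    # enforce the per-subsection cap and de-duplicate queued URLs.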
    articles = {}
    subsection_links = {}
    urls_done = []
    links_per_section = {}

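    # Entry point for calibre's download: find section links on the home page,
    # expand each into its subsections, queue article links, then return
    # (section title, article list) pairs.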
    def parse_index(self):
        section_links = self.section_links_from_home_page()
        for section_link in section_links:
            self.articles[self.section_name(section_link)] = []
            subsection_links = self.find_subsection_links(section_link)
            for subsection_link in subsection_links:
                sub_name = self.subsection_name(subsection_link)
                self.subsection_links[sub_name] = []
                self.parse_subsection(section_link, subsection_link)
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

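    # Section links on the home page carry the 'rmRootLink' CSS class; anything
    # whose link text is not listed in self.sections is filtered out.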
    def section_links_from_home_page(self):

        def include_link(link):
            return self.text(link).lower() in self.sections

        url = 'http://www.hurriyetdailynews.com/'
        try:
            self._p('hitting home page ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider home page')
            return []

        self._p('Got home page. Hunt down section links.')

        regex = re.compile('rmRootLink', re.IGNORECASE)
        links = soup.findAll('a', {'class': regex})

        filtered_links = [link for link in links if include_link(link)]
        self._p(' all sections: ' + ', '.join(map(self.text, links)))
        self._p(' filtered sections: ' +
                ', '.join(map(self.text, filtered_links)))

        return filtered_links

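    # Subsection links are scraped from the section page's 'SeffafLink' div;
    # if that div is missing, the section page itself is used as the only
    # "subsection".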
    def find_subsection_links(self, section_link):
        self._p('find subsection links for section ' + str(section_link))
        url = self.abs_url(section_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider subsection')
            return []
        self._p('Got ' + url)

        div = soup.find('div', {'class': 'SeffafLink'})
        if not div:
            self._p('could not find any subsections')
            return [section_link]
        links = div.findAll('a')
        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
        return links

    def parse_subsection(self, section_link, subsection_link):

        section = self.section_name(section_link)
        if len(self.articles[section]) > self.max_articles_per_section:
            return

        # tmp dbg
        # if not self.subsection_name(subsection_link) == 'arts':
        #     return

        self._p('hit section ' + section +
                ', subsect ' + self.subsection_name(subsection_link))
        url = self.abs_url(subsection_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return

        self._p('Process links')
        for link in soup.findAll('a'):
            if 'NewsDetail' in str(link.get('id')):
                self.process_link(section_link, subsection_link, link)

    def process_link(self, section_link, subsection_link, link):
        section = self.section_name(section_link)
        subsection = self.subsection_name(subsection_link)
        title = link.get('title') or self.text(link)
        href = link.get('href')
        if not href:
            self._p('BAD HREF: ' + str(link))
            return
        self.queue_article_link(section, subsection, href, title)

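    # Queue a single article, skipping URLs that have already been queued and
    # enforcing the per-section and per-subsection caps.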
    def queue_article_link(self, section, subsection, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
            return

        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        if len(self.subsection_links[subsection]) >= \
                self.max_articles_per_subsection:
            return
        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
        full_title = string.capwords(subsection + ' - ' + title)
        self.subsection_links[subsection].append(url)
        self.articles[section].append(
            dict(title=full_title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def text(self, n):
        return self.tag_to_string(n).strip()

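    # Normalize relative links to absolute http://www.hurriyetdailynews.com
    # URLs and strip any '#fragment' so duplicate detection works on the
    # canonical form.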
    def abs_url(self, url):
        if 'www.hurriyetdailynews.com' in url:
            abs_url = url
        elif url.startswith('/'):
            abs_url = 'http://www.hurriyetdailynews.com' + url
        else:
            abs_url = 'http://www.hurriyetdailynews.com/' + url
        if '#' in abs_url:
            abs_url = abs_url.split('#')[0]

        return abs_url

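    # Section names come from the link text; subsection names come from the
    # last path component of the subsection URL, with the extension stripped.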
    def section_name(self, link):
        return self.text(link).lower()

    def subsection_name(self, link):
        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
        return from_fn

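    # Articles older than oldest_article are dropped at download time:
    # returning None from preprocess_raw_html makes calibre log an error for
    # that article and skip it (see the mobileread link below).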
    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore it, see
            # http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(HurriyetDailyNews_en, self).preprocess_raw_html(raw_html, url)

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return 'too old'
        return False

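    # The site renders article dates in a couple of formats (for example
    # 'September/17/2014' or 'September 17/2014'); try each in turn and fall
    # back to None.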
    def date_from_string(self, datestring):
        try:
            # eg: September/17/2014
            dt = datetime.datetime.strptime(datestring, "%B/%d/%Y")
        except ValueError:
            try:
                # eg: September 17/2014
                dt = datetime.datetime.strptime(datestring, "%B %d/%Y")
            except ValueError:
                dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def scrape_article_date(self, soup):
        dnode = soup.find('p', {'class': 'dateagency'}) or \
            soup.find('p', {'class': 'Tarih'})
        if dnode:
            dstring = self.text(dnode)
            return self.date_from_string(dstring)
        else:
            return None

    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' txt: ' + self.text(node)
        return s

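    # Tiny debug logger: prefix each message with the upper-cased name of the
    # calling method (looked up via inspect) and truncate long messages.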
    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:120])