Mirror of https://github.com/kovidgoyal/calibre.git
Hurriyet Daily News by spswerling

commit 92cfab55a1
parent 2b23b0d342
recipes/hurriyet_daily_news.recipe | 260 lines (new file)
@@ -0,0 +1,260 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.hurriyetdailynews.com
'''
import os, string, inspect, datetime, re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class HurriyetDailyNews_en(BasicNewsRecipe):
    title = u'Hurriyet Daily News'
    __author__ = u'spswerling'
    description = 'English edition of the Turkish daily "Hurriyet"'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    max_articles_per_section = 25
    max_articles_per_subsection = 7

    sections = [
        u'turkey',
        u'economy',
        u'world',
        u'sports',
        # u'life',
        u'opinion',
        # u'arts/culture'
    ]

    # Util for creating keep_only_tags / remove_tags style regex matchers.
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]

    remove_tags = [
        tag_matcher('div', 'class', 'Carousel'),
        tag_matcher('div', 'class', 'ShareIt'),
        tag_matcher('div', 'class', 'tmz'),
        tag_matcher('span', 'id', 'comment'),
        tag_matcher('h2', 'class', 'NewSpot'),
        tag_matcher('h2', 'class', 'pv-gallery'),
    ]

    articles = {}
    subsection_links = {}
    urls_done = []
    links_per_section = {}

    def parse_index(self):
        section_links = self.section_links_from_home_page()
        for section_link in section_links:
            self.articles[self.section_name(section_link)] = []
            subsection_links = self.find_subsection_links(section_link)
            for subsection_link in subsection_links:
                sub_name = self.subsection_name(subsection_link)
                self.subsection_links[sub_name] = []
                self.parse_subsection(section_link, subsection_link)
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

    def section_links_from_home_page(self):

        def include_link(link):
            return self.text(link).lower() in self.sections

        url = 'http://www.hurriyetdailynews.com/'
        try:
            self._p('hitting home page ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider home page')
            return []

        self._p('Got home page. hunt down section links.')

        regex = re.compile('rmRootLink', re.IGNORECASE)
        links = soup.findAll('a', {'class': regex})

        # Materialize the filter to a list so the logging below does not
        # exhaust it before it is returned.
        filtered_links = list(filter(include_link, links))
        self._p(' all sections: ' + ', '.join(map(self.text, links)))
        self._p(' filtered sections: ' +
                ', '.join(map(self.text, filtered_links)))

        return filtered_links

    def find_subsection_links(self, section_link):
        self._p('find subsection links for section ' + str(section_link))
        url = self.abs_url(section_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider subsection')
            return []
        self._p('Got ' + url)

        div = soup.find('div', {'class': 'SeffafLink'})
        if not div:
            self._p('could not find any subsections')
            return [section_link]
        links = div.findAll('a')
        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
        return links

    def parse_subsection(self, section_link, subsection_link):

        section = self.section_name(section_link)
        if len(self.articles[section]) > self.max_articles_per_section:
            return

        # tmp dbg
        # if not self.subsection_name(subsection_link) == 'arts':
        #    return

        self._p('hit section ' + section +
                ', subsect ' + self.subsection_name(subsection_link))
        url = self.abs_url(subsection_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider section')
            return []

        self._p('Process links')
        for link in soup.findAll('a'):
            if 'NewsDetail' in str(link.get('id')):
                self.process_link(section_link, subsection_link, link)

    def process_link(self, section_link, subsection_link, link):
        section = self.section_name(section_link)
        subsection = self.subsection_name(subsection_link)
        # Fall back to the link text when the title attribute is missing.
        title = link.get('title') or self.text(link)
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, subsection, href, title)

    def queue_article_link(self, section, subsection, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
            return

        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        if len(self.subsection_links[subsection]) >= \
                self.max_articles_per_subsection:
            return
        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
        full_title = string.capwords(subsection + ' - ' + title)
        self.subsection_links[subsection].append(url)
        self.articles[section].append(
            dict(title=full_title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def text(self, n):
        return self.tag_to_string(n).strip()

    def abs_url(self, url):
        if 'www.hurriyetdailynews.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.hurriyetdailynews.com' + url
        else:
            abs_url = 'http://www.hurriyetdailynews.com/' + url
        if '#' in abs_url:
            abs_url = ''.join(abs_url.split('#')[0:-1])

        return abs_url

    def section_name(self, link):
        return self.text(link).lower()

    def subsection_name(self, link):
        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
        return from_fn

    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # The next line will show up as an error in the logs, but ignore it, see
            #   http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(HurriyetDailyNews_en, self).preprocess_raw_html(raw_html, url)

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return "too old"
        return False

    def date_from_string(self, datestring):
        try:
            # eg: September/17/2014
            dt = datetime.datetime.strptime(datestring, "%B/%d/%Y")
        except:
            try:
                # eg: September 17/2014
                dt = datetime.datetime.strptime(datestring, "%B %d/%Y")
            except:
                dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def scrape_article_date(self, soup):
        dnode = soup.find('p', {'class': 'dateagency'}) or \
                soup.find('p', {'class': 'Tarih'})
        if dnode:
            dstring = self.text(dnode)
            return self.date_from_string(dstring)
        else:
            return None

    def _dbg_soup_node(self, node):
        s = '   cls: ' + str(node.get('class')).strip() + \
            '  id: ' + str(node.get('id')).strip() + \
            ' txt: ' + self.text(node)
        return s

    def _p(self, msg):
        # Log helper: prefix each message with the name of the calling method.
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:120])
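
A quick way to exercise a recipe like this from the command line is calibre's ebook-convert tool, which accepts a .recipe file directly; the --test recipe option is meant to keep the run small during development and -vv raises the log verbosity. The file name and epub output below are just examples, any supported output format works:

    ebook-convert hurriyet_daily_news.recipe output.epub --test -vv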