Improved The Independent

This commit is contained in:
Kovid Goyal 2011-11-12 08:17:47 +05:30
parent 3c3f10dbee
commit 86e5c79180
3 changed files with 232 additions and 45 deletions

View File

@ -8,6 +8,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
class TheIndependentNew(BasicNewsRecipe): class TheIndependentNew(BasicNewsRecipe):
# flag to enable/disable article graphics on business pages/some others
# eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
# -max dimensions can be altered using the .pictureContainer img selector in the css
_FETCH_ARTICLE_GRAPHICS = True
#Flag to enable/disable image fetching (not business)
_FETCH_IMAGES = True
#used for converting rating to stars #used for converting rating to stars
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png' _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
_NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png' _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
@ -35,7 +44,15 @@ class TheIndependentNew(BasicNewsRecipe):
] ]
keep_only_tags =[dict(attrs={'id':'main'})] keep_only_tags =[dict(attrs={'id':'main'})]
recursions = 0
# fixes non compliant html nesting and 'marks' article graphics links
preprocess_regexps = [
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
(re.compile('<strong>.*?Click.*?to view graphic.*?</strong>', re.DOTALL),
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
]
conversion_options = { conversion_options = {
@ -62,14 +79,34 @@ class TheIndependentNew(BasicNewsRecipe):
.column-1 p,a,h1,h2,h3 { margin: 0; } .column-1 p,a,h1,h2,h3 { margin: 0; }
.column-1 div{color:#888888; margin: 0;} .column-1 div{color:#888888; margin: 0;}
.articleContent {display: block; clear:left;} .articleContent {display: block; clear:left;}
.storyTop{}
.pictureContainer img { max-width: 400px; max-height: 400px;}
""" """
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
_processed_urls = []
def get_article_url(self, article):
    """Return the URL to fetch for a feed entry, or None to skip the entry.

    The URL comes from the parent class implementation. The entry is
    skipped (None is returned) when its title starts with "Video:" or when
    the same URL has already been returned by an earlier call; returned
    URLs are remembered in ``self._processed_urls``.
    """
    # Name the class explicitly. super(self.__class__, self) recurses
    # forever as soon as this recipe is subclassed, because self.__class__
    # is then the subclass and the lookup lands on this method again.
    url = super(TheIndependentNew, self).get_article_url(article)

    title = article.get('title', None)
    if title and title.startswith('Video:'):
        return None

    # remove duplicates
    if url in self._processed_urls:
        return None
    self._processed_urls.append(url)
    return url
def preprocess_html(self, soup): def preprocess_html(self, soup):
items_to_extract = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}): for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True remove = True
pattern = re.compile('((articleContent)|(title))$') pattern = re.compile('((articleContent)|(title))$')
@ -85,7 +122,10 @@ class TheIndependentNew(BasicNewsRecipe):
#images #images
pattern = re.compile('slideshow') pattern = re.compile('slideshow')
if (pattern.search(item['class'])) is not None: if (pattern.search(item['class'])) is not None:
remove = False if self._FETCH_IMAGES:
remove = False
else:
remove = True
#social widgets always bad #social widgets always bad
pattern = re.compile('socialwidget') pattern = re.compile('socialwidget')
@ -93,30 +133,36 @@ class TheIndependentNew(BasicNewsRecipe):
remove = True remove = True
if remove: if remove:
item.extract() items_to_extract.append(item)
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}): for item in items_to_extract:
if item.img is not None: item.extract()
#use full size image
img = item.findNext('img') items_to_extract = []
img['src'] = item['href'] if self._FETCH_IMAGES:
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}):
#insert caption if available if item.img is not None:
if img['title'] is not None and (len(img['title']) > 1): #use full size image
tag = Tag(soup,'h3') img = item.findNext('img')
text = NavigableString(img['title'])
tag.insert(0,text) img['src'] = item['href']
#picture before text #insert caption if available
img.extract() if img['title'] is not None and (len(img['title']) > 1):
item.insert(0,img) tag = Tag(soup,'h3')
item.insert(1,tag) text = NavigableString(img['title'])
tag.insert(0,text)
# remove link
item.name = "div" #picture before text
item["class"]='image' img.extract()
del item["href"] item.insert(0,img)
item.insert(1,tag)
# remove link
item.name = "div"
item["class"]='image'
del item["href"]
#remove empty subtitles #remove empty subtitles
@ -127,13 +173,12 @@ class TheIndependentNew(BasicNewsRecipe):
""" """
subtitle = soup.find('h3',attrs={'class' : 'subtitle'}) subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
if subtitle is not None: if subtitle is not None:
subtitleText = subtitle.findNext('p') subtitleText = subtitle.findNext('p')
if subtitleText is not None: if subtitleText is not None:
if len(subtitleText.contents[0]) <= 1 : if len(subtitleText.contents[0]) <= 1 :
subtitleText.extract() subtitleText.extract()
subtitle.extract() subtitle.extract()
#replace rating numbers with stars #replace rating numbers with stars
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}): for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
@ -141,10 +186,64 @@ class TheIndependentNew(BasicNewsRecipe):
soup2 = self._insertRatingStars(soup,item) soup2 = self._insertRatingStars(soup,item)
if soup2 is not None: if soup2 is not None:
soup = soup2 soup = soup2
#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
if item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
#remove line breaks immediately next to tags with default margins
#to prevent double line spacing and narrow columns of text
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
self._remove_undesired_line_breaks_from_tag(storyTop,soup)
#replace article graphics link with the graphics themselves
if self._FETCH_ARTICLE_GRAPHICS:
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
for item in items_to_insert:
item[0].replaceWith(item[1])
for item in items_to_extract:
item.extract()
return soup return soup
def _get_article_graphic(self,old_item,url,soup):
items_to_insert = []
if re.search('\.jpg$',str(url)):
div = Tag(soup,'div')
div['class'] = 'pictureContainer'
img = Tag(soup,'img')
img['src'] = url
img['alt'] = 'article graphic'
div.insert(0,img)
items_to_insert.append((old_item,div,))
return items_to_insert
soup2 = self.index_to_soup(url)
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
items_to_insert.append((old_item,item),)
return items_to_insert
def _insertRatingStars(self,soup,item): def _insertRatingStars(self,soup,item):
@ -167,6 +266,7 @@ class TheIndependentNew(BasicNewsRecipe):
def postprocess_html(self,soup, first_fetch): def postprocess_html(self,soup, first_fetch):
#find broken images and remove captions #find broken images and remove captions
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}): for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img') img = item.findNext('img')
if img is not None and img['src'] is not None: if img is not None and img['src'] is not None:
@ -175,20 +275,114 @@ class TheIndependentNew(BasicNewsRecipe):
if pattern.match(img["src"]) is not None: if pattern.match(img["src"]) is not None:
caption = img.findNextSibling('h3') caption = img.findNextSibling('h3')
if caption is not None: if caption is not None:
caption.extract() items_to_extract.append(caption)
img.extract() items_to_extract.append(img)
return soup
for item in items_to_extract:
item.extract()
return soup
def _recurisvely_linearise_tag_tree(
self,
item,
linearised= None,
count=0,
limit = 100
):
linearised = linearised or []
count = count + 1
if count > limit:
return linearised
if not (isinstance(item,Tag)):
return linearised
for nested in item:
linearised.append(nested)
linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
return linearised
def _get_previous_tag(self,current_index, tag_tree):
if current_index == 0:
return None
else:
return tag_tree[current_index - 1]
def _get_next_tag(self,current_index, tag_tree):
if current_index < len(tag_tree) - 1:
return tag_tree[current_index + 1]
else:
return None
def _list_match(self,test_str, list_regex):
for regex in list_regex:
match = re.match(regex, test_str)
if match is not None:
return True
return False
def _remove_undesired_line_breaks_from_tag(self,parent,soup):
if parent is None:
return
tag_tree = self._recurisvely_linearise_tag_tree(parent)
items_to_remove = []
for item in tag_tree:
if item == u'\n':
items_to_remove.append(item)
continue;
for item in items_to_remove:
tag_tree.remove(item)
spaced_tags = [r'p', r'h\d', r'blockquote']
tags_to_extract = []
tags_to_replace = []
for (i, tag) in enumerate(tag_tree):
if isinstance(tag, Tag):
if str(tag) == '<br />':
previous_tag = self._get_previous_tag(i, tag_tree)
if isinstance(previous_tag, Tag):
previous_tag_is_spaced = previous_tag is not None\
and self._list_match(str(previous_tag.name),
spaced_tags)
else:
previous_tag_is_spaced = False
next_tag = self._get_next_tag(i, tag_tree)
if isinstance(next_tag, Tag):
next_tag_is_spaced = next_tag is not None\
and self._list_match(str(next_tag.name), spaced_tags)
else:
next_tag_is_spaced = False
if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
or i == len(tag_tree) - 1:
tags_to_extract.append(tag)
else:
tags_to_replace.append((tag,NavigableString(' '),))
for pair in tags_to_replace:
pair[0].replaceWith(pair[1])
for tag in tags_to_extract:
tag.extract()
feeds = [ feeds = [
(u'News - UK', (u'News - UK',
u'http://www.independent.co.uk/news/uk/?service=rss'), u'http://www.independent.co.uk/news/uk/?service=rss'),
(u'News - World', (u'News - World',
u'http://www.independent.co.uk/news/world/?service=rss'), u'http://www.independent.co.uk/news/world/?service=rss'),
(u'News - Business', (u'News - Business',
u'http://www.independent.co.uk/news/business/?service=rss'), u'http://www.independent.co.uk/news/business/?service=rss'),
(u'News - People', (u'News - People',
u'http://www.independent.co.uk/news/people/?service=rss'), u'http://www.independent.co.uk/news/people/?service=rss'),
(u'News - Science', (u'News - Science',
@ -289,6 +483,3 @@ class TheIndependentNew(BasicNewsRecipe):
u'http://www.independent.co.uk/extras/indybest/?service=rss'), u'http://www.independent.co.uk/extras/indybest/?service=rss'),
] ]

View File

@ -136,7 +136,7 @@ the previously checked out |app| code directory, for example::
cd /Users/kovid/work/calibre cd /Users/kovid/work/calibre
calibre is the directory that contains the src and resources sub-directories. Ensure you have installed the |app| commandline tools via :guilabel:Preferences->Advanced->Miscellaneous in the |app| GUI. calibre is the directory that contains the src and resources sub-directories. Ensure you have installed the |app| commandline tools via :guilabel:`Preferences->Advanced->Miscellaneous` in the |app| GUI.
The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory. The next step is to set the environment variable ``CALIBRE_DEVELOP_FROM`` to the absolute path of the src directory.
So, following the example above, it would be ``/Users/kovid/work/calibre/src``. Apple So, following the example above, it would be ``/Users/kovid/work/calibre/src``. Apple

View File

@ -242,10 +242,6 @@ Replace ``192.168.1.2`` with the local IP address of the computer running |app|.
If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the Stanza settings. Go to Info->Settings and increase the value of Download Timeout. If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the Stanza settings. Go to Info->Settings and increase the value of Download Timeout.
.. note::
As of iOS version 5 Stanza no longer works on Apple devices. Alternatives to Stanza are discussed `in this forum <http://www.mobileread.com/forums/showthread.php?t=152789>`_.
Using iBooks Using iBooks
************** **************