diff --git a/recipes/foreignaffairs.recipe b/recipes/foreignaffairs.recipe
index 69511cbd09..c7fa21b3e9 100644
--- a/recipes/foreignaffairs.recipe
+++ b/recipes/foreignaffairs.recipe
@@ -3,10 +3,17 @@ import re
from calibre.ptempfile import PersistentTemporaryFile
class ForeignAffairsRecipe(BasicNewsRecipe):
+ ''' there are three modifications:
+ 1) fetch issue cover
+ 2) toggle ignore premium articles
+ 3) extract proper section names, e.g. "Comments", "Essay"
+
+ by Chen Wei weichen302@gmx.com, 2012-02-05'''
+
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'en'
- version = 1
+ version = 1.01
title = u'Foreign Affairs (Subcription or (free) Registration)'
publisher = u'Council on Foreign Relations'
@@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
remove_javascript = True
INDEX = 'http://www.foreignaffairs.com'
+ FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
+ INCLUDE_PREMIUM = False
+
remove_tags = []
remove_tags.append(dict(name = 'base'))
@@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
temp_files = []
articles_are_obfuscated = True
+    def get_cover_url(self):
+        '''Return the current issue's cover URL, or None when the cover markup is absent.'''
+        div = self.index_to_soup(self.FRONTPAGE).find('div', attrs={'class':'inthemag-issuebuy-cover'})
+        img = div.find('img') if div is not None else None  # a missing div/img must not raise
+        return (self.INDEX + img['src']) if img is not None else None
+
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
@@ -50,57 +66,47 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return self.temp_files[-1].name
def parse_index(self):
+        '''Return [(section title, [article dict, ...]), ...] scraped from FRONTPAGE;
+        entries marked premium are skipped unless INCLUDE_PREMIUM is set.'''
- soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
- articles = []
answer = []
- content = soup.find('div', attrs = {'class': 'center-wrapper'})
- if content:
- for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
- tag = div.find('div', attrs = {'class': 'views-field-title'})
- if tag:
- a = tag.find('a')
- if a:
- title = self.tag_to_string(a)
- url = self.INDEX + a['href']
-
- author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
- tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
- # If they ever fix their markup, this will break :-(
- summary = self.tag_to_string(tag.findNextSibling('p'))
- description = author + '<br />' + summary
-
- articles.append({'title': title, 'date': None, 'url': url, 'description': description})
- else:
- continue
- else:
- continue
-
- answer.append(('Magazine', articles))
-
- ul = content.find('ul')
- if ul:
+        soup = self.index_to_soup(self.FRONTPAGE)
+        sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
+        for sec in sec_start:
+            content = sec.nextSibling
+            # a missing sibling or a bare text node has no findAll() and cannot hold articles
+            if hasattr(content, 'findAll'):
+                section = self.tag_to_string(content.find('h2'))
articles = []
- for li in ul.findAll('li'):
- tag = li.find('div', attrs = {'class': 'views-field-title'})
- if tag:
- a = tag.find('a')
- if a:
- title = self.tag_to_string(a)
- url = self.INDEX + a['href']
- description = ''
- tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
- if tag:
- description = self.tag_to_string(tag)
- articles.append({'title': title, 'date': None, 'url': url, 'description': description})
- else:
- continue
+                # article rows first, then any plain list items found in the same panel
+                tags = list(content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-(?:odd|even).*')}))
+                tags.extend(content.findAll('li'))
+
+                for div in tags:
+                    title = url = description = author = None
+
+                    # with INCLUDE_PREMIUM set, the premium-icon check is bypassed entirely
+                    if self.INCLUDE_PREMIUM:
+                        found_premium = False
else:
- continue
-
- answer.append(('Letters to the Editor', articles))
+                        found_premium = div.findAll('span', attrs={'class':
+                            'premium-icon'})
+                    if not found_premium:
+                        tag = div.find('div', attrs={'class': 'views-field-title'})
+                        if tag:
+                            a = tag.find('a')
+                            if a:
+                                title = self.tag_to_string(a)
+                                url = self.INDEX + a['href']
+                                author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
+                                tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
+                                description = self.tag_to_string(tag_summary)
+                                articles.append({'title':title, 'date':None, 'url':url,
+                                    'description':description, 'author':author})
+                if articles:
+                    answer.append((section, articles))
return answer
def preprocess_html(self, soup):
diff --git a/recipes/ilmanifesto.recipe b/recipes/ilmanifesto.recipe
new file mode 100644
index 0000000000..d7428cebb2
--- /dev/null
+++ b/recipes/ilmanifesto.recipe
@@ -0,0 +1,110 @@
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+MANIFESTO_BASEURL = 'http://www.ilmanifesto.it/'
+
+class IlManifesto(BasicNewsRecipe):
+ title = 'Il Manifesto'
+ __author__ = 'Giacomo Lacava'
+ description = 'quotidiano comunista - ultima edizione html disponibile'
+ publication_type = 'newspaper'
+ publisher = 'il manifesto coop. editrice a r.l.'
+ language = 'it'
+
+ oldest_article = 2
+ max_articles_per_feed = 100
+ delay = 1
+ no_stylesheets = True
+ simultaneous_downloads = 5
+ timeout = 30
+ auto_cleanup = True
+ remove_tags = [dict(name='div', attrs={'class':'column_1 float_left'})]
+ remove_tags_before = dict(name='div',attrs={'class':'column_2 float_right'})
+ remove_tags_after = dict(id='myPrintArea')
+
+ manifesto_index = None
+ manifesto_datestr = None
+
+    def _set_manifesto_index(self):
+        '''Look up the edition link (lastEdition) once and cache its index URL
+        in manifesto_index and its date path component in manifesto_datestr.'''
+        if self.manifesto_index is None:
+            startUrl = MANIFESTO_BASEURL + 'area-abbonati/in-edicola/'
+            startSoup = self.index_to_soup(startUrl)
+            lastEdition = startSoup.findAll('div',id='accordion_inedicola')[1].find('a')['href']
+            self.manifesto_index = MANIFESTO_BASEURL + lastEdition
+            urlsplit = lastEdition.split('/')
+            # a trailing slash leaves an empty last component; fall back to the one before it
+            self.manifesto_datestr = urlsplit[-1] or urlsplit[-2]
+
+
+
+    def get_cover_url(self):
+        '''Return the front-page GIF URL built from the cached manifesto_datestr.'''
+        self._set_manifesto_index()
+        return MANIFESTO_BASEURL + 'fileadmin/archivi/in_edicola/%sprimapagina.gif' % self.manifesto_datestr
+
+    def parse_index(self):
+        '''Return [(feed name, [article dict, ...]), ...] for the edition found by _set_manifesto_index.'''
+        self._set_manifesto_index()
+        soup = self.index_to_soup(self.manifesto_index)
+        feedLinks = soup.find('div',id='accordion_inedicola').findAll('a')
+        result = []
+        for feed in feedLinks:
+            articles = []
+            feedName = self.tag_to_string(feed.find('h2')) # .string is None when the tag has nested markup
+            feedUrl = MANIFESTO_BASEURL + feed['href']
+            feedSoup = self.index_to_soup(feedUrl)
+            indexRoot = feedSoup.find('div',attrs={'class':'column1'})
+            if indexRoot is None: continue # feed page without an article column
+            for div in indexRoot.findAll('div',attrs={'class':'strumenti1_inedicola'}):
+                artLink = div.find('a')
+                if artLink is None: continue # empty div
+                title = self.tag_to_string(artLink)
+                url = MANIFESTO_BASEURL + artLink['href']
+
+                description = ''
+                descNode = div.find('div',attrs={'class':'text_12'})
+                if descNode is not None:
+                    description = self.tag_to_string(descNode)
+
+                author = ''
+                authNode = div.find('div',attrs={'class':'firma'})
+                if authNode is not None:
+                    author = self.tag_to_string(authNode)
+
+                articles.append({
+                    'title':title,
+                    'url':url,
+                    'date': strftime('%d %B %Y'),
+                    'description': description,
+                    'content': '',
+                    'author': author
+                })
+            result.append((feedName,articles))
+        return result
+
+
+ def extract_readable_article(self, html, url):
+
+ bs = BeautifulSoup(html)
+ col1 = bs.find('div',attrs={'class':'column1'})
+
+ content = col1.find('div',attrs={'class':'bodytext'})
+ title = bs.find(id='titolo_articolo').string
+ author = col1.find('span',attrs={'class':'firma'})
+ subtitle = ''
+ subNode = col1.findPrevious('div',attrs={'class':'occhiello_rosso'})
+ if subNode is not None:
+ subtitle = subNode
+ summary = ''
+ sommNode = bs.find('div',attrs={'class':'sommario'})
+ if sommNode is not None:
+ summary = sommNode
+
+ template = "
\u00a0
\n'.encode('utf-8'), res) + f.write(res) + self.write_inline_css(inline_class, border_styles) + stream.seek(0) + mi = get_metadata(stream, 'rtf') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + opf = OPFCreator(os.getcwd(), mi) + opf.create_manifest([('index.xhtml', None)]) + opf.create_spine(['index.xhtml']) + opf.render(open('metadata.opf', 'wb')) + return os.path.abspath('metadata.opf') + + diff --git a/src/calibre/ebooks/rtf/output.py b/src/calibre/ebooks/conversion/plugins/rtf_output.py similarity index 94% rename from src/calibre/ebooks/rtf/output.py rename to src/calibre/ebooks/conversion/plugins/rtf_output.py index 5738b7e6f4..ae9e1ea566 100644 --- a/src/calibre/ebooks/rtf/output.py +++ b/src/calibre/ebooks/conversion/plugins/rtf_output.py @@ -6,7 +6,6 @@ __docformat__ = 'restructuredtext en' import os -from calibre.ebooks.rtf.rtfml import RTFMLizer from calibre.customize.conversion import OutputFormatPlugin class RTFOutput(OutputFormatPlugin): @@ -16,6 +15,8 @@ class RTFOutput(OutputFormatPlugin): file_type = 'rtf' def convert(self, oeb_book, output_path, input_plugin, opts, log): + from calibre.ebooks.rtf.rtfml import RTFMLizer + rtfmlitzer = RTFMLizer(log) content = rtfmlitzer.extract_content(oeb_book, opts) diff --git a/src/calibre/ebooks/snb/input.py b/src/calibre/ebooks/conversion/plugins/snb_input.py similarity index 97% rename from src/calibre/ebooks/snb/input.py rename to src/calibre/ebooks/conversion/plugins/snb_input.py index 13b1ca45f9..ae3ab0033c 100755 --- a/src/calibre/ebooks/snb/input.py +++ b/src/calibre/ebooks/conversion/plugins/snb_input.py @@ -4,13 +4,11 @@ __license__ = 'GPL 3' __copyright__ = '2010, Li Fanxi\u00a0
\n'.encode('utf-8'), res) - f.write(res) - self.write_inline_css(inline_class, border_styles) - stream.seek(0) - mi = get_metadata(stream, 'rtf') - if not mi.title: - mi.title = _('Unknown') - if not mi.authors: - mi.authors = [_('Unknown')] - opf = OPFCreator(os.getcwd(), mi) - opf.create_manifest([('index.xhtml', None)]) - opf.create_spine(['index.xhtml']) - opf.render(open('metadata.opf', 'wb')) - return os.path.abspath('metadata.opf') - -#ebook-convert "bad.rtf" test.epub -v -d "E:\Mes eBooks\Developpement\debug" -# os.makedirs("E:\\Mes eBooks\\Developpement\\rtfdebug") -# debug_dir = "E:\\Mes eBooks\\Developpement\\rtfdebug" diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 4cff648fa5..0880eca4ca 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -16,7 +16,7 @@ from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.conversion.preprocess import DocAnalysis from calibre.utils.cleantext import clean_ascii_chars -HTML_TEMPLATE = u' tags. It condense and retains blank lines when necessary.
-
+
Requires paragraphs to be in single line format.
'''
txt = clean_txt(txt)
@@ -215,7 +215,7 @@ def detect_paragraph_type(txt):
def detect_formatting_type(txt):
'''
Tries to determine the formatting of the document.
-
+
markdown: Markdown formatting is used.
textile: Textile formatting is used.
heuristic: When none of the above formatting types are
diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py
index 35cc249acb..b3e128af82 100644
--- a/src/calibre/gui2/__init__.py
+++ b/src/calibre/gui2/__init__.py
@@ -102,6 +102,7 @@ gprefs.defaults['cb_fullscreen'] = False
gprefs.defaults['worker_max_time'] = 0
gprefs.defaults['show_files_after_save'] = True
gprefs.defaults['auto_add_path'] = None
+gprefs.defaults['auto_add_check_for_duplicates'] = False
# }}}
NONE = QVariant() #: Null value to return from the data function of item models
diff --git a/src/calibre/gui2/add.py b/src/calibre/gui2/add.py
index 7cdac3b845..972ea57cb9 100644
--- a/src/calibre/gui2/add.py
+++ b/src/calibre/gui2/add.py
@@ -382,7 +382,9 @@ class Adder(QObject): # {{{
if not duplicates:
return self.duplicates_processed()
self.pd.hide()
- files = [x[0].title for x in duplicates]
+        # named placeholders let translators reorder title and author in the message
+        files = [_('%(title)s by %(author)s')%dict(title=x[0].title,
+            author=x[0].format_field('authors')[1]) for x in duplicates]
if question_dialog(self._parent, _('Duplicates found!'),
_('Books with the same title as the following already '
'exist in the database. Add them anyway?'),
diff --git a/src/calibre/gui2/auto_add.py b/src/calibre/gui2/auto_add.py
index 71d2b8ecd0..6860f386d6 100644
--- a/src/calibre/gui2/auto_add.py
+++ b/src/calibre/gui2/auto_add.py
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal