merge from trunk

2025-07-09 03:04:10 -04:00 · 2011-02-07 20:17:08 +08:00 · 2011-02-07 20:17:08 +08:00 · e77e42d1a9
commit e77e42d1a9
parent 8c9c5d35e4 5b0d4f1f10
9 changed files with 226 additions and 24 deletions
--- a/resources/recipes/europa_press.recipe
+++ b/resources/recipes/europa_press.recipe
@ -0,0 +1,55 @@
+__license__   = 'GPL v3'
+__author__    = 'Luis Hernandez'
+__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
+__version__     = 'v1.0'
+__date__        = '30 January 2011'
+
+'''
+www.europapress.es
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294946868(BasicNewsRecipe):
+    # News recipe for the Spanish news agency Europa Press (www.europapress.es).
+
+    title          = u'Europa Press'
+    # The other recipes in this change set declare the class-level __author__;
+    # a plain "author" attribute is presumably never read by BasicNewsRecipe,
+    # so the dunder name is used here as well - confirm against the base class.
+    __author__            = 'Luis Hernandez'
+    description           = 'spanish news agency'
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+
+    remove_javascript = True
+    no_stylesheets        = True
+    use_embedded_content  = False
+
+    language              = 'es'
+    timefmt        = '[%a, %d %b, %Y]'
+
+    # Article boundaries, selected by the class / id of the surrounding div.
+    remove_tags_before = dict(name='div' , attrs={'class':['nivel1 bg_3col']})
+    remove_tags_after  = dict(name='div' , attrs={'id':['ImprimirEnviarNoticia']})
+
+    # Page furniture to drop, matched by tag name and (where given) id / class.
+    # Note that every <table> and <li> element is listed without any filter.
+    remove_tags = [
+                             dict(name='ul', attrs={'id':['entidadesNoticia','MenuSecciones']})
+                            ,dict(name='div', attrs={'id':['ImprimirEnviarNoticia','PublicidadSuperior','CabeceraDerecha','Comentarios','comentarios full fbConnectAPI','ComentarEstaNoticia','ctl00_Superior_Main_MasEnChance_cajamasnoticias','gl_chn','videos_portada_derecha','galeria_portada_central','galeria_portada_central_boxes']})
+                            ,dict(name='div', attrs={'class':['infoRelacionada','col_1','buscador','caja doblecolumna strong','CHANCE_EP_Encuesta_frontal text','seccionportada col_0','seccion header','text','pie caption_over']})
+                            ,dict(name='a', attrs={'class':['buscadorLabel']})
+                            ,dict(name='span', attrs={'class':['editado']})
+                            ,dict(name='table')
+                            ,dict(name='li')
+                        ]
+
+
+    # (section title, RSS URL) pairs; the "ch" query parameter selects the channel.
+    feeds = [
+                 (u'Portada'              , u'http://www.europapress.es/rss/rss.aspx')
+                ,(u'Nacional'             , u'http://www.europapress.es/rss/rss.aspx?ch=66')
+                ,(u'Internacional'       , u'http://www.europapress.es/rss/rss.aspx?ch=69')
+                ,(u'Economia'           , u'http://www.europapress.es/rss/rss.aspx?ch=136')
+                ,(u'Deportes'            , u'http://www.europapress.es/rss/rss.aspx?ch=67')
+                ,(u'Cultura'               , u'http://www.europapress.es/rss/rss.aspx?ch=126')
+                ,(u'Sociedad'            , u'http://www.europapress.es/rss/rss.aspx?ch=73')
+                ,(u'Motor'                 , u'http://www.europapress.es/rss/rss.aspx?ch=435')
+                ,(u'CHANCE'             , u'http://www.europapress.es/rss/rss.aspx?ch=549')
+                ,(u'Comunicados'      , u'http://www.europapress.es/rss/rss.aspx?ch=137')
+             ]
+
--- a/resources/recipes/irish_times.recipe
+++ b/resources/recipes/irish_times.recipe
@ -35,7 +35,7 @@ class IrishTimes(BasicNewsRecipe):
    def print_version(self, url):
+        '''
+        Return the printer-friendly ("_pf.html") URL for an article URL.
+
+        URLs containing 'rss.feedsportal.com' are rebuilt: a fixed-length
+        prefix is sliced off, the escapes '0C' and '0A' are turned back
+        into '/' and '0', the trailing '0Bhtml/story01.htm' becomes
+        '_pf.html', and the result is appended to the irishtimes.com host.
+        Any other URL just has '.html' replaced with '_pf.html'.
+        '''
         if url.count('rss.feedsportal.com'):
            u = 'http://www.irishtimes.com' + \
-                     (((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
+                     (((url[70:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
         else:
             u = url.replace('.html','_pf.html')
         return u
--- a/resources/recipes/radio_prague.recipe
+++ b/resources/recipes/radio_prague.recipe
@ -0,0 +1,43 @@
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1291540961(BasicNewsRecipe):
+    # English-language recipe built from the radio.cz "themes" RSS feeds.
+    # NOTE(review): the title below is identical to the one used by the Czech
+    # radio_praha recipe, although the publisher here is 'Radio Prague' -
+    # confirm that the shared title is intended.
+
+    title          = u'Radio Praha'
+    __author__             = 'Francois Pellicaan'
+    description            = 'News and information from and about The Czech republic. '
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets         = True
+    use_embedded_content   = False
+    remove_empty_feeds     = True
+    encoding               = 'utf8'
+    publisher              = 'Radio Prague'
+    category               = 'News'
+    language               = 'en_CZ'
+    publication_type       = 'newsportal'
+
+    extra_css = 'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; }  \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }'
+
+
+    keep_only_tags = [
+                       dict(name='div', attrs={'class':['main']})
+                        ]
+    remove_tags = [
+                       dict(name='div', attrs={'class':['cleaner', 'options', 'toolsXXL']}),
+                       dict(name='ul', attrs={'class':['tools']})
+                        ]
+    # (section title, RSS URL) pairs. The 'European Union' entry previously
+    # carried a doubled scheme ('http:http://...'), which is not a valid URL.
+    feeds = [
+                      (u'Current Affairs', 'http://www.radio.cz/feeds/rss/en/themes/curraffrs.xml'),
+                      (u'Society', 'http://www.radio.cz/feeds/rss/en/themes/society.xml'),
+                      (u'European Union', 'http://www.radio.cz/feeds/rss/en/themes/eu.xml'),
+                      (u'Foreign policy', 'http://www.radio.cz/feeds/rss/en/themes/foreignpolicy.xml'),
+                      (u'Business', 'http://www.radio.cz/feeds/rss/en/themes/business.xml'),
+                      (u'Culture', 'http://www.radio.cz/feeds/rss/en/themes/culture.xml'),
+                      (u'Czechs abroad', 'http://www.radio.cz/feeds/rss/en/themes/czechabroad.xml'),
+                      (u'History', 'http://www.radio.cz/feeds/rss/en/themes/history.xml'),
+                      (u'Nature', 'http://www.radio.cz/feeds/rss/en/themes/nature.xml'),
+                      (u'Science', 'http://www.radio.cz/feeds/rss/en/themes/science.xml'),
+                      (u'Sport', 'http://www.radio.cz/feeds/rss/en/themes/sport.xml'),
+                      (u'Travel', 'http://www.radio.cz/feeds/rss/en/themes/travel.xml'),
+                    ]
--- a/resources/recipes/radio_praha.recipe
+++ b/resources/recipes/radio_praha.recipe
@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1291540961(BasicNewsRecipe):
+    # Czech-language recipe built from the radio.cz "oblast" RSS feeds.
+
+    # Descriptive metadata.
+    title = u'Radio Praha'
+    __author__ = 'Francois Pellicaan'
+    description = u'Česká oficiální mezinárodní vysílací stanice.'
+    publisher = u'Český rozhlas'
+    category = 'News'
+    language = 'cs'
+    publication_type = 'newsportal'
+
+    # Download settings.
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_empty_feeds = True
+    encoding = 'utf8'
+
+    extra_css = u'h1 .section { display: block; text-transform: uppercase; font-size: 10px; margin-top: 4em; } \n .title { font-size: 14px; margin-top: 4em; } \n a.photo { display: block; clear:both; }  \n .caption { font-size: 9px; display: block; clear:both; padding:0px 0px 20px 0px; } \n a { font-type: normal; }'
+
+    # Tag filters, matched by tag name and class.
+    keep_only_tags = [
+        dict(name='div', attrs={'class': ['main']}),
+    ]
+    remove_tags = [
+        dict(name='div', attrs={'class': ['cleaner', 'options', 'toolsXXL']}),
+        dict(name='ul', attrs={'class': ['tools']}),
+    ]
+
+    # (section title, RSS URL) pairs.
+    feeds = [
+        (u'Domácí politika', 'http://www.radio.cz/feeds/rss/cs/oblast/dompol.xml'),
+        (u'Společnost', 'http://www.radio.cz/feeds/rss/cs/oblast/spolecnost.xml'),
+        (u'Evropská unie', 'http://www.radio.cz/feeds/rss/cs/oblast/eu.xml'),
+        (u'Zahraniční politika', 'http://www.radio.cz/feeds/rss/cs/oblast/zahrpol.xml'),
+        (u'Ekonomika', 'http://www.radio.cz/feeds/rss/cs/oblast/ekonomika.xml'),
+        (u'Kultura', 'http://www.radio.cz/feeds/rss/cs/oblast/kultura.xml'),
+        (u'Krajané', 'http://www.radio.cz/feeds/rss/cs/oblast/krajane.xml'),
+        (u'Historie', 'http://www.radio.cz/feeds/rss/cs/oblast/historie.xml'),
+        (u'Příroda', 'http://www.radio.cz/feeds/rss/cs/oblast/priroda.xml'),
+        (u'Věda', 'http://www.radio.cz/feeds/rss/cs/oblast/veda.xml'),
+        (u'Sport', 'http://www.radio.cz/feeds/rss/cs/oblast/sport.xml'),
+        (u'Cestování', 'http://www.radio.cz/feeds/rss/cs/oblast/cestovani.xml'),
+    ]
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@ -15,6 +15,7 @@ from calibre import guess_type, strftime
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
 from calibre.library.comments import comments_to_html
+from calibre.utils.date import is_date_undefined

 JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'

@ -130,7 +131,10 @@ def render_jacket(mi, output_profile,
        publisher = ''

    try:
-        pubdate = strftime(u'%Y', mi.pubdate.timetuple())
+        if is_date_undefined(mi.pubdate):
+            pubdate = ''
+        else:
+            pubdate = strftime(u'%Y', mi.pubdate.timetuple())
    except:
        pubdate = ''

@ -175,19 +179,24 @@ def render_jacket(mi, output_profile,
        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class':'cbj_series'})
-            series_tag.extract()
+            if series_tag is not None:
+                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class':'cbj_rating'})
-            rating_tag.extract()
+            if rating_tag is not None:
+                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class':'cbj_tags'})
-            tags_tag.extract()
+            if tags_tag is not None:
+                tags_tag.extract()
        if not pubdate:
-            pubdate_tag = soup.find(attrs={'class':'cbj_pubdate'})
-            pubdate_tag.extract()
+            pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
+            if pubdate_tag is not None:
+                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
-            hr_tag.extract()
+            if hr_tag is not None:
+                hr_tag.extract()

        return soup.renderContents(None)

--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    normalize_line_endings, convert_textile, remove_indents
+    normalize_line_endings, convert_textile, remove_indents, block_to_single_line
 from calibre import _ent_pat, xml_entity_to_unicode

 class TXTInput(InputFormatPlugin):
@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin):
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)

-        if options.txt_in_remove_indents:
-            txt = remove_indents(txt)
-
-        # Preserve spaces will replace multiple spaces to a space
-        # followed by the &nbsp; entity.
-        if options.preserve_spaces:
-            txt = preserve_spaces(txt)
-
        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
@ -114,6 +106,7 @@ class TXTInput(InputFormatPlugin):
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_paragraphs_print_formatted(txt)
+            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
@ -122,6 +115,8 @@ class TXTInput(InputFormatPlugin):
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
+        else:
+            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
@ -130,6 +125,15 @@ class TXTInput(InputFormatPlugin):
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

+        # User requested transformation on the text.
+        if options.txt_in_remove_indents:
+            txt = remove_indents(txt)
+
+        # Preserve spaces will replace multiple spaces to a space
+        # followed by the &nbsp; entity.
+        if options.preserve_spaces:
+            txt = preserve_spaces(txt)
+
        # Process the text using the appropriate text processor.
        html = ''
        if options.formatting_type == 'markdown':
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

 def clean_txt(txt):
+    '''
+    Run transformations on the text to put it into
+    consistent state.
+    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
@ -42,6 +46,15 @@ def clean_txt(txt):
    return txt

 def split_txt(txt, epub_split_size_kb=0):
+    '''
+    Ensure there are split points for converting
+    to EPUB. A misdetected paragraph type can
+    result in the entire document being one giant
+    paragraph. In this case the EPUB parser will not
+    be able to determine where to split the file
+    to accommodate the EPUB file size limitation
+    and will fail.
+    '''
    #Takes care if there is no point to split
    if epub_split_size_kb > 0:
        if isinstance(txt, unicode):
@ -59,6 +72,12 @@ def split_txt(txt, epub_split_size_kb=0):
    return txt

 def convert_basic(txt, title='', epub_split_size_kb=0):
+    '''
+    Converts plain text to html by putting all paragraphs in
+    <p> tags. It condenses and retains blank lines when necessary.
+    
+    Requires paragraphs to be in single line format.
+    '''
    txt = clean_txt(txt)
    txt = split_txt(txt, epub_split_size_kb)

@ -99,15 +118,25 @@ def separate_paragraphs_single_line(txt):
    return txt

 def separate_paragraphs_print_formatted(txt):
+    '''
+    Insert a newline in front of every line that begins with one or
+    more tabs or with two or more spaces and has further content on
+    the line. The matched indent itself is kept.
+    '''
-    txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
+    txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
+    return txt
+
+def block_to_single_line(txt):
+    '''
+    Join the lines of each paragraph into a single line.
+
+    A newline is replaced with a space only when a character other
+    than a newline sits directly before and directly after it, so
+    blank lines are left in place.
+    '''
+    txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
    return txt

 def preserve_spaces(txt):
+    '''
+    Replaces each run of two or more spaces with a single space
+    followed by one &nbsp; entity per remaining space, and each tab
+    with four &nbsp; entities.
+    '''
    txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt

 def remove_indents(txt):
+    '''
+    Remove whitespace at the beginning of each line.
+
+    The whitespace class in the pattern also matches newlines, so
+    blank and whitespace-only lines are swallowed together with the
+    indent of the line that follows them.
+    '''
    txt = re.sub('(?miu)^\s+', '', txt)
    return txt

@ -118,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
    with open(os.path.join(path, opf_name), 'wb') as opffile:
        opf.render(opffile)

-def split_string_separator(txt, size) :
+def split_string_separator(txt, size):
+    '''
+    Splits the text by putting \n\n at the point size.
+    '''
    if len(txt) > size:
        txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
            txt[i:i+size], 1) for i in
@ -127,7 +159,7 @@ def split_string_separator(txt, size) :

 def detect_paragraph_type(txt):
    '''
-    Tries to determine the formatting of the document.
+    Tries to determine the paragraph type of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
@ -170,6 +202,16 @@ def detect_paragraph_type(txt):


 def detect_formatting_type(txt):
+    '''
+    Tries to determine the formatting of the document.
+    
+    markdown: Markdown formatting is used.
+    textile: Textile formatting is used.
+    heuristic: When none of the above formatting types are
+               detected heuristic is returned.
+    '''
+    # Keep a count of the number of format-specific objects
+    # that are found in the text.
    markdown_count = 0
    textile_count = 0

@ -193,6 +235,8 @@ def detect_formatting_type(txt):
    # Links
    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))

+    # Decide if either markdown or textile is used in the text
+    # based on the number of unique formatting elements found.
    if markdown_count > 5 or textile_count > 5:
        if markdown_count > textile_count:
            return 'markdown'
--- a/src/calibre/gui2/dialogs/message_box.py
+++ b/src/calibre/gui2/dialogs/message_box.py
@ -89,7 +89,8 @@ class MessageBox(QDialog, Ui_Dialog):
                (__version__, unicode(self.windowTitle()),
                    unicode(self.msg.text()),
                    unicode(self.det_msg.toPlainText())))
-        self.ctc_button.setText(_('Copied'))
+        if hasattr(self, 'ctc_button'):
+            self.ctc_button.setText(_('Copied'))

    def showEvent(self, ev):
        ret = QDialog.showEvent(self, ev)
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -414,7 +414,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        row = self.data._data[index] if index_is_id else self.data[index]
        return row[self.FIELD_MAP['path']].replace('/', os.sep)

-
    def abspath(self, index, index_is_id=False, create_dirs=True):
        'Return the absolute path to the directory containing this books files as a unicode string.'
        path = os.path.join(self.library_path, self.path(index, index_is_id=index_is_id))
@ -422,7 +421,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
            os.makedirs(path)
        return path

-
    def construct_path_name(self, id):
        '''
        Construct the directory name for this book based on its metadata.
@ -432,7 +430,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
            authors = _('Unknown')
        author = ascii_filename(authors.split(',')[0])[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace')
        title  = ascii_filename(self.title(id, index_is_id=True))[:self.PATH_LIMIT].decode(filesystem_encoding, 'replace')
-        path   = author + '/' + title + ' (%d)'%id
+        # rstrip() cannot raise IndexError on an empty or all-space/period name, unlike author[-1]
+        author = author.rstrip(' .')
+        if not author:
+            author = ascii_filename(_('Unknown')).decode(filesystem_encoding, 'replace')
+        path = author + '/' + title + ' (%d)'%id
        return path

    def construct_file_name(self, id):