Merge from trunk

2025-08-30 23:00:21 -04:00 · 2011-02-11 10:44:40 +00:00 · 2011-02-11 10:44:40 +00:00 · 914ddaae86
commit 914ddaae86
parent 83cde4af65 c21d927caa
12 changed files with 993 additions and 645 deletions
--- a/resources/recipes/el_periodico.recipe
+++ b/resources/recipes/el_periodico.recipe
@ -5,8 +5,8 @@ __license__     = 'GPL v3'
 __copyright__   = '04 December 2010, desUBIKado'
 __author__      = 'desUBIKado'
 __description__ = 'Daily newspaper from Aragon'
-__version__     = 'v0.05'
-__date__        = '07, December 2010'
+__version__     = 'v0.07'
+__date__        = '06, February 2011'
 '''
 elperiodicodearagon.com
 '''
@ -38,22 +38,26 @@ class elperiodicodearagon(BasicNewsRecipe):
                            ,'publisher' : publisher
                         }

-    feeds              = [(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
-                          (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
-                          (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
-                          (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
-                          (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
-                          (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
-                          (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
-                          (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
-                          (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
-                          (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')]
+    feeds              = [
+                           (u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
+                           (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
+                           (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
+                           (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
+                           (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
+                           (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
+                           (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
+                           (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
+                           (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
+                           (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
+                         ]


    extra_css = '''
-                    h3{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
-                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-                    dd{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+                    h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+                    h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
+                    h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
+                    .columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
+                    img{margin-bottom: 0.4em}
 		'''

    remove_attributes = ['height','width']
@ -82,6 +86,7 @@ class elperiodicodearagon(BasicNewsRecipe):
                          dict(name='a', attrs={'class':'AvisoComentario'}),
                          dict(name='div', attrs={'class':'CajaAvisoComentario'}),
                          dict(name='div', attrs={'class':'navegaNoticias'}),
+                          dict(name='div', attrs={'class':'Mensaje'}),
                          dict(name='div', attrs={'id':'PaginadorDiCom'}),
                          dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
                          dict(name='div', attrs={'id':'CintilloComentario'}),
@ -107,3 +112,15 @@ class elperiodicodearagon(BasicNewsRecipe):
        (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
        ]
+
+    # Replace the embedded YouTube video with an image
+
+    def preprocess_html(self, soup):  # turn every iframe titled 'YouTube video player' into an <img>; returns the same soup object
+        for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
+            if video_yt:
+               video_yt.name = 'img'  # retag the iframe element in place
+               fuente = video_yt['src']  # the iframe's original embed URL
+               fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')  # swap the embed prefix for the image-host prefix
+               video_yt['src'] = fuente2 + '/0.jpg'  # presumably the video's preview picture -- TODO confirm
+
+        return soup
--- a/resources/recipes/tedneward.recipe
+++ b/resources/recipes/tedneward.recipe
@ -0,0 +1,33 @@
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
+'''
+blogs.tedneward.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class InteroperabilityHappens(BasicNewsRecipe):  # recipe settings for the 'Interoperability Happens' blog
+    title                 = 'Interoperability Happens'
+    __author__            = 'Darko Miletic'
+    description           = 'Tech blog by Ted Neward'
+    oldest_article        = 15
+    max_articles_per_feed = 100
+    language              = 'en'
+    encoding              = 'utf-8'
+    no_stylesheets        = True
+    use_embedded_content  = True
+    publication_type      = 'blog'
+    extra_css             = """
+                                body{font-family: Verdana,Arial,Helvetica,sans-serif}
+                            """
+
+    conversion_options = {  # 'comment' and 'language' reuse the class attributes defined above
+                          'comment'  : description
+                        , 'tags'     : 'blog, technology, microsoft, programming, C#, Java'
+                        , 'publisher': 'Ted Neward'
+                        , 'language' : language
+                        }
+
+    feeds = [(u'Posts', u'http://blogs.tedneward.com/SyndicationService.asmx/GetRss')]  # single (title, url) feed entry
+
--- a/resources/recipes/weblogs_sl.recipe
+++ b/resources/recipes/weblogs_sl.recipe
@ -0,0 +1,104 @@
+#!/usr/bin/env  python
+__license__     = 'GPL v3'
+__copyright__   = '4 February 2011, desUBIKado'
+__author__      = 'desUBIKado'
+__version__     = 'v0.05'
+__date__        = '9, February 2011'
+'''
+http://www.weblogssl.com/
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class weblogssl(BasicNewsRecipe):  # recipe settings for the Weblogs SL family of blogs listed in `feeds`
+    __author__     = 'desUBIKado'
+    description    = u'Weblogs colectivos dedicados a seguir la actualidad sobre tecnologia, entretenimiento, estilos de vida, motor, deportes y economia.'
+    title          = u'Weblogs SL (Xataka, Genbeta, VidaExtra, Blog de Cine y otros)'
+    publisher      = 'Weblogs SL'
+    category       = 'Gadgets, Tech news, Product reviews, mobiles, science, cinema, entertainment, culture, tv, food, recipes, life style, motor, F1, sports, economy'
+    language       = 'es'
+    timefmt        = '[%a, %d %b, %Y]'
+    oldest_article = 1.5
+    max_articles_per_feed = 100
+    encoding       = 'utf-8'
+    use_embedded_content  = False
+    remove_empty_feeds    = True
+    remove_javascript = True
+    no_stylesheets = True
+
+    # If you do not want to fetch every blog, disable the one you want to skip by putting
+    # a # character in front of it, e.g.  # (u'Applesfera', u'http://feeds.weblogssl.com/applesfera'),
+    # would stop Applesfera from being downloaded. NOTE: the last feed must not end with a comma
+
+    feeds              = [
+                          (u'Xataka', u'http://feeds.weblogssl.com/xataka2'),
+                          (u'Xataka M\xf3vil', u'http://feeds.weblogssl.com/xatakamovil'),
+                          (u'Xataka Android', u'http://feeds.weblogssl.com/xatakandroid'),
+                          (u'Xataka Foto', u'http://feeds.weblogssl.com/xatakafoto'),
+                          (u'Xataka ON', u'http://feeds.weblogssl.com/xatakaon'),
+                          (u'Xataka Ciencia', u'http://feeds.weblogssl.com/xatakaciencia'),
+                          (u'Genbeta', u'http://feeds.weblogssl.com/genbeta'),
+                          (u'Applesfera', u'http://feeds.weblogssl.com/applesfera'),
+                          (u'Vida Extra', u'http://feeds.weblogssl.com/vidaextra'),
+                          (u'Naci\xf3n Red', u'http://feeds.weblogssl.com/nacionred'),
+                          (u'Blog de Cine', u'http://feeds.weblogssl.com/blogdecine'),
+                          (u'Vaya tele', u'http://feeds.weblogssl.com/vayatele2'),
+                          (u'Hipers\xf3nica', u'http://feeds.weblogssl.com/hipersonica'),
+                          (u'Diario del viajero', u'http://feeds.weblogssl.com/diariodelviajero'),
+                          (u'Papel en blanco', u'http://feeds.weblogssl.com/papelenblanco'),
+                          (u'Pop rosa', u'http://feeds.weblogssl.com/poprosa'),
+                          (u'Zona FandoM', u'http://feeds.weblogssl.com/zonafandom'),
+                          (u'Fandemia', u'http://feeds.weblogssl.com/fandemia'),
+                          (u'Noctamina', u'http://feeds.weblogssl.com/noctamina'),
+                          (u'Tendencias', u'http://feeds.weblogssl.com/trendencias'),
+                          (u'Beb\xe9s y m\xe1s', u'http://feeds.weblogssl.com/bebesymas'),
+                          (u'Directo al paladar', u'http://feeds.weblogssl.com/directoalpaladar'),
+                          (u'Compradicci\xf3n', u'http://feeds.weblogssl.com/compradiccion'),
+                          (u'Decoesfera', u'http://feeds.weblogssl.com/decoesfera'),
+                          (u'Embelezzia', u'http://feeds.weblogssl.com/embelezzia'),
+                          (u'Vit\xf3nica', u'http://feeds.weblogssl.com/vitonica'),
+                          (u'Ambiente G', u'http://feeds.weblogssl.com/ambienteg'),
+                          (u'Arrebatadora', u'http://feeds.weblogssl.com/arrebatadora'),
+                          (u'Mensencia', u'http://feeds.weblogssl.com/mensencia'),
+                          (u'Peques y m\xe1s', u'http://feeds.weblogssl.com/pequesymas'),
+                          (u'Motorpasi\xf3n', u'http://feeds.weblogssl.com/motorpasion'),
+                          (u'Motorpasi\xf3n F1', u'http://feeds.weblogssl.com/motorpasionf1'),
+                          (u'Motorpasi\xf3n Moto', u'http://feeds.weblogssl.com/motorpasionmoto'),
+                          (u'Notas de futbol', u'http://feeds.weblogssl.com/notasdefutbol'),
+                          (u'Fuera de l\xedmites', u'http://feeds.weblogssl.com/fueradelimites'),
+                          (u'Salir a ganar', u'http://feeds.weblogssl.com/saliraganar'),
+                          (u'El blog salm\xf3n', u'http://feeds.weblogssl.com/elblogsalmon2'),
+                          (u'Pymes y aut\xf3nomos', u'http://feeds.weblogssl.com/pymesyautonomos'),
+                          (u'Tecnolog\xeda Pyme', u'http://feeds.weblogssl.com/tecnologiapyme'),
+                          (u'Ahorro diario', u'http://feeds.weblogssl.com/ahorrodiario')
+                         ]
+
+
+    keep_only_tags     = [dict(name='div', attrs={'id':'infoblock'}),
+                          dict(name='div', attrs={'class':'post'}),
+                          dict(name='div', attrs={'id':'blog-comments'})
+                         ]
+
+    remove_tags        = [dict(name='div', attrs={'id':'comment-nav'})]
+
+    def print_version(self, url):  # rewrite the article URL from the www. host to the m. host (presumably the mobile site)
+          return url.replace('http://www.', 'http://m.')
+
+    preprocess_regexps = [
+                            # Put a blank line between one comment and the next
+                           (re.compile(r'<li id="c', re.DOTALL|re.IGNORECASE), lambda match: '<br><br><li id="c')
+                         ]
+
+    # Replace the embedded YouTube video with an image
+
+    def preprocess_html(self, soup):  # turn every iframe titled 'YouTube video player' into an <img>; returns the same soup object
+        for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
+            if video_yt:
+               video_yt.name = 'img'  # retag the iframe element in place
+               fuente = video_yt['src']  # the iframe's original embed URL
+               fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')  # swap the embed prefix for the image-host prefix
+               fuente3 = fuente2.replace('?rel=0','')  # drop the '?rel=0' query string before appending the file name
+               video_yt['src'] = fuente3 + '/0.jpg'  # presumably the video's preview picture -- TODO confirm
+
+        return soup
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -83,7 +83,7 @@ class ANDROID(USBMS):
            'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
            'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
            'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT', 'A70H',
-            'IDEOS_TABLET', 'MYTOUCH_4G', 'UMS_COMPOSITE']
+            'IDEOS_TABLET', 'MYTOUCH_4G', 'UMS_COMPOSITE', 'SCH-I800_CARD']
    WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
            'A70S', 'A101IT']
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@ -39,6 +39,13 @@ def asfloat(value):
        return 0.0
    return float(value)

+def isspace(text):  # True when text is empty/None or only whitespace; text containing U+00A0 is never considered blank
+    if not text:
+        return True
+    if u'\xa0' in text:  # checked first: unicode.isspace() would otherwise count the no-break space as whitespace
+        return False
+    return text.isspace()
+
 class BlockState(object):
    def __init__(self, body):
        self.body = body
@ -438,7 +445,7 @@ class MobiMLizer(object):
        if elem.text:
            if istate.preserve:
                text = elem.text
-            elif len(elem) > 0 and elem.text.isspace():
+            elif len(elem) > 0 and isspace(elem.text):
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
@ -481,7 +488,7 @@ class MobiMLizer(object):
            if child.tail:
                if istate.preserve:
                    tail = child.tail
-                elif bstate.para is None and child.tail.isspace():
+                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@ -70,7 +70,7 @@ class PML_HTMLizer(object):
        'c': ('<div style="text-align: center; margin: auto;">', '</div>'),
        'r': ('<div style="text-align: right;">', '</div>'),
        't': ('<div style="margin-left: 5%;">', '</div>'),
-        'T': ('<div style="margin-left: %s;">', '</div>'),
+        'T': ('<div style="text-indent: %s;">', '</div>'),
        'i': ('<span style="font-style: italic;">', '</span>'),
        'u': ('<span style="text-decoration: underline;">', '</span>'),
        'd': ('<span style="text-decoration: line-through;">', '</span>'),
@ -499,7 +499,13 @@ class PML_HTMLizer(object):
        self.toc = []
        self.file_name = file_name

-        indent_state = {'t': False, 'T': False}
+        # t: Are we in an open \t tag set?
+        # T: Are we in an open \T?
+        # st: Did the \t start the line?
+        # sT: Did the \T start the line?
+        # et: Did the \t end the line?
+        indent_state = {'t': False, 'T': False, 'st': False, 'sT': False, 'et': False}
+        basic_indent = False
        adv_indent_val = ''
        # Keep track of the number of empty lines
        # between paragraphs. When we reach a set number
@ -512,8 +518,26 @@ class PML_HTMLizer(object):
        for line in pml.splitlines():
            parsed = []
            empty = True
+
            basic_indent = indent_state['t']
-            adv_indent = indent_state['T']
+            indent_state['T'] = False
+            # Determine if the \t starts the line or if we are
+            # in an open \t block.
+            if line.lstrip().startswith('\\t') or basic_indent:
+                basic_indent = True
+                indent_state['st'] = True
+            else:
+                indent_state['st'] = False
+            # Determine if the \T starts the line.
+            if line.lstrip().startswith('\\T'):
+                indent_state['sT'] = True
+            else:
+                indent_state['sT'] = False
+            # Determine if the \t ends the line.
+            if line.rstrip().endswith('\\t'):
+                indent_state['et'] = True
+            else:
+                indent_state['et'] = False

            # Must use StringIO, cStringIO does not support unicode
            line = StringIO.StringIO(line)
@ -575,13 +599,10 @@ class PML_HTMLizer(object):
                        empty = False
                        text = '<hr width="%s" />' % self.code_value(line)
                    elif c == 't':
-                        indent_state[c] = not indent_state[c]
-                        if indent_state[c]:
-                            basic_indent = True
+                        indent_state['t'] = not indent_state['t']
                    elif c == 'T':
                        # Ensure we only store the value on the first T set for the line.
                        if not indent_state['T']:
-                            adv_indent = True
                            adv_indent_val = self.code_value(line)
                        else:
                            # We detected a T previously on this line.
@ -610,10 +631,23 @@ class PML_HTMLizer(object):
                text = self.end_line()
                parsed.append(text)
                
+                # Basic indent will be set if the \t starts the line or
+                # if we are in a continuing \t block.
                if basic_indent:
-                    parsed.insert(0, self.STATES_TAGS['t'][0])
-                    parsed.append(self.STATES_TAGS['t'][1])
-                elif adv_indent:
+                    # if the \t started the line and either it ended the line or the \t
+                    # block is still open use a left margin.
+                    if indent_state['st'] and (indent_state['et'] or indent_state['t']):
+                        parsed.insert(0, self.STATES_TAGS['t'][0])
+                        parsed.append(self.STATES_TAGS['t'][1])
+                    # Use a text indent instead of a margin.
+                    # This handles cases such as:
+                    # \tO\tne upon a time...
+                    else:
+                        parsed.insert(0, self.STATES_TAGS['T'][0] % '5%')
+                        parsed.append(self.STATES_TAGS['T'][1])
+                # \t will override \T's on the line.
+                # We only handle \T's that started the line.
+                elif indent_state['T'] and indent_state['sT']:
                    parsed.insert(0, self.STATES_TAGS['T'][0] % adv_indent_val)
                    parsed.append(self.STATES_TAGS['T'][1])
                    indent_state['T'] = False
--- a/src/calibre/gui2/preferences/tweaks.py
+++ b/src/calibre/gui2/preferences/tweaks.py
@ -227,8 +227,12 @@ class PluginTweaks(QDialog): # {{{
        self.highlighter = PythonHighlighter(self.edit.document())
        self.l = QVBoxLayout()
        self.setLayout(self.l)
-        self.l.addWidget(QLabel(
-            _('Add/edit tweaks for any custom plugins you have installed.')))
+        self.msg = QLabel(
+            _('Add/edit tweaks for any custom plugins you have installed. '
+                'Documentation for these tweaks should be available '
+                'on the website from where you downloaded the plugins.'))
+        self.msg.setWordWrap(True)
+        self.l.addWidget(self.msg)
        self.l.addWidget(self.edit)
        self.edit.setPlainText(raw)
        self.bb = QDialogButtonBox(QDialogButtonBox.Ok|QDialogButtonBox.Cancel,
--- a/src/calibre/gui2/viewer/documentview.py
+++ b/src/calibre/gui2/viewer/documentview.py
@ -440,16 +440,17 @@ class Document(QWebPage): # {{{

    @property
    def height(self):
-        j = self.javascript('document.body.offsetHeight', 'int')
+        # Note that document.body.offsetHeight does not include top and bottom
+        # margins on body and in some cases does not include the top margin on
+        # the first element inside body either. See ticket #8791 for an example
+        # of the latter.
        q = self.mainFrame().contentsSize().height()
-        if q == j:
-            return j
-        if min(j, q) <= 0:
-            return max(j, q)
-        window_height = self.window_height
-        if j == window_height:
-            return j if q < 1.2*j else q
-        return j
+        if q < 0:
+            # Don't know if this is still needed, but it can't hurt
+            j = self.javascript('document.body.offsetHeight', 'int')
+            if j >= 0:
+                q = j
+        return q

    @property
    def width(self):
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@ -561,9 +561,10 @@ format, whether input or output are available in the conversion dialog under the
 Convert Microsoft Word documents
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-|app| does not directly convert .doc files from Microsoft Word. However, in Word, you can save the document
+|app| does not directly convert .doc/.docx files from Microsoft Word. However, in Word, you can save the document
 as HTML and then convert the resulting HTML file with |app|. When saving as HTML, be sure to use the
-"Save as Web Page, Filtered" option as this will produce clean HTML that will convert well.
+"Save as Web Page, Filtered" option as this will produce clean HTML that will convert well. Note that Word
+produces really messy HTML; converting it can take a long time, so be patient.

 There is a Word macro package that can automate the conversion of Word documents using |app|. It also makes
 generating the Table of Contents much simpler. It is called BookCreator and is available for free
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@ -8,11 +8,13 @@ import re, htmlentitydefs
 _ascii_pat = None

 def clean_ascii_chars(txt, charlist=None):
-    'remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default'
+    '''
+    Remove ASCII control chars: 0 to 8 and 11, 12, 14-31 by default
+    This is all control chars except \\t,\\n and \\r
+    '''
    global _ascii_pat
    if _ascii_pat is None:
-        chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
-            + [0x1A, 0x1B]
+        chars = list(range(8)) + [0x0B, 0x0C] + list(range(0x0E, 0x1F))
        _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))

    if charlist is None:
--- a/src/calibre/web/feeds/init.py
+++ b/src/calibre/web/feeds/init.py
@ -13,6 +13,7 @@ from calibre.web.feeds.feedparser import parse
 from calibre.utils.logging import default_log
 from calibre import entity_to_unicode, strftime
 from calibre.utils.date import dt_factory, utcnow, local_tz
+from calibre.utils.cleantext import clean_ascii_chars

 class Article(object):

@ -43,7 +44,7 @@ class Article(object):
                print summary.encode('utf-8')
                traceback.print_exc()
                summary = u''
-        self.text_summary = summary
+        self.text_summary = clean_ascii_chars(summary)
        self.author = author
        self.content = content
        self.date = published