Merge from trunk

commit 6859477f1f by Charles Haley, 2011-11-14 08:23:12 +01:00
15 changed files with 224 additions and 33 deletions


@@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
             del item['style']
         ad=soup.findAll('a')
         for r in ad:
-            if 'http://www.hustla.pl' in r['href']:
+            if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
                 r.extract()
         gallery=soup.find('div', attrs={'class':'galleryFlash'})
         if gallery:


@@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
     __author__ = 'fluzao'
     description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
                   u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
-    INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'
+    #found this to be the easiest place to find the index page (13-Nov-2011).
+    # searching for the "Indice Geral" link
+    HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
+    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
     language = 'pt'
     no_stylesheets = True
     max_articles_per_feed = 40
     remove_javascript = True
     needs_subscription = True
-    remove_tags_before = dict(name='b')
+    remove_tags_before = dict(name='p')
     remove_tags = [dict(name='td', attrs={'align':'center'})]
     remove_attributes = ['height','width']
-    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
     # fixes the problem with the section names
     section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
                     'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
                     'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
-                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
+                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
+                    'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
+                    'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
     # this solves the problem with truncated content in Kindle
     conversion_options = {'linearize_tables' : True}
     # this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
     #    Indice e Comunicar Erros
-    preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
-                            re.DOTALL|re.IGNORECASE), lambda match: r''),
-                          (re.compile(r'<BR><BR>Pr&oacute;ximo Texto:.*<!--/NOTICIA-->',
+    preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
                             re.DOTALL|re.IGNORECASE), lambda match: r'')]
 
     def get_browser(self):
@@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):
     def parse_index(self):
-        soup = self.index_to_soup(self.INDEX)
+        #Searching for the index page on the HOMEPAGE
+        hpsoup = self.index_to_soup(self.HOMEPAGE)
+        indexref = hpsoup.find('a', href=re.compile('^indices.*'))
+        self.log('--> tag containing the today s index: ', indexref)
+        INDEX = indexref['href']
+        INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX
+        self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
+        # ... and taking the opportunity to get the cover image link
+        coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
+        if coverurl:
+            self.log('--> tag containing the today s cover: ', coverurl)
+            coverurl = coverurl.replace('htm', 'jpg')
+            coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl
+            self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
+            self.cover_url = coverurl
+        #soup = self.index_to_soup(self.INDEX)
+        soup = self.index_to_soup(INDEX)
         feeds = []
         articles = []
         section_title = "Preambulo"
@@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
                     self.log('--> new section title: ', section_title)
                 if strpost.startswith('<a href'):
                     url = post['href']
+                    #this bit is kept if they ever go back to the old format (pre Nov-2011)
                     if url.startswith('/fsp'):
                         url = 'http://www1.folha.uol.com.br'+url
+                    #
+                    if url.startswith('http://www1.folha.uol.com.br/fsp'):
+                        #url = 'http://www1.folha.uol.com.br'+url
                         title = self.tag_to_string(post)
                         self.log()
                         self.log('--> post: ', post)
@@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
         # keeping the front page url
         minha_capa = feeds[0][1][1]['url']
-        # removing the 'Preambulo' section
+        # removing the first section (now called 'top')
         del feeds[0]
-        # creating the url for the cover image
-        coverurl = feeds[0][1][0]['url']
-        coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
-        coverurl = coverurl.replace('01.htm', '.jpg')
-        self.cover_url = coverurl
         # inserting the cover page as the first article (nicer for kindle users)
         feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
         return feeds
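For reference, the new index and cover discovery in parse_index() boils down to the steps below. This is a minimal standalone sketch for trying the homepage scraping outside calibre; it assumes only what the hunk itself relies on (an <a href="indices..."> link and an <a href="cp...htm"> cover link on the FSP homepage), with urllib2 and BeautifulSoup standing in for the recipe's index_to_soup() helper.

import re, urllib2
from BeautifulSoup import BeautifulSoup   # BS3, as bundled with calibre at the time

HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
soup = BeautifulSoup(urllib2.urlopen(HOMEPAGE).read())

# the "Indice Geral" link points at the daily index page
index_url = HOMEPAGE + soup.find('a', href=re.compile('^indices.*'))['href']

# the cover page link (cp...htm) doubles as the name of the cover image (cp...jpg)
cover_href = soup.find('a', href=re.compile('^cp.*'))['href']
cover_url = HOMEPAGE + 'images/' + cover_href.replace('htm', 'jpg')

print index_url
print cover_url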


@@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
     category = 'history'
     language = 'pl'
     oldest_article = 8
+    remove_empty_feeds=True
     max_articles_per_feed = 100
-    feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]
+    feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
+             (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
+             (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
+             (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
+             (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
+             (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
+             (u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
+             (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
+             (u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]

recipes/icons/infra_pl.png (new binary file, 1.5 KiB)

recipes/infra_pl.recipe (new file, 17 lines)

@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class INFRA(BasicNewsRecipe):
    title = u'INFRA'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
    cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
    category = 'UFO'
    language = 'pl'
    max_articles_per_feed = 100
    no_stylesheets=True
    remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
    remove_tags_after=dict(attrs={'class':'pagenav'})
    remove_tags=[dict(attrs={'class':'pagenav'})]
    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]

recipes/spiders_web_pl.png (new binary file, 605 B)


@@ -0,0 +1,15 @@
from calibre.web.feeds.news import BasicNewsRecipe

class SpidersWeb(BasicNewsRecipe):
    title = u"Spider's Web"
    oldest_article = 7
    __author__ = 'fenuks'
    description = u''
    cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg'
    category = 'IT, WEB'
    language = 'pl'
    max_articles_per_feed = 100
    remove_tags_before=dict(name="h1", attrs={'class':'Title'})
    remove_tags_after=dict(name="div", attrs={'class':'Text'})
    remove_tags=[dict(name='div', attrs={'class':['Tags', 'CommentCount FloatL', 'Show FloatL']})]
    feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')]


@@ -9,6 +9,7 @@ class Tablety_pl(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    keep_only_tags=[dict(name='header', attrs={'class':'entry-header'}), dict(name='div', attrs={'class':'entry-content clearfix'})]
-    remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'}), dict(name='span', attrs={'class':'dsq-postid'})]
+    remove_tags_before=dict(name="h1", attrs={'class':'entry-title'})
+    remove_tags_after=dict(name="div", attrs={'class':'snap_nopreview sharing robots-nocontent'})
+    remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'})]
     feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]


@@ -106,7 +106,7 @@ class ANDROID(USBMS):
             0x61c5 : [0x100, 0x226, 0x9999],
             0x61cc : [0x100],
             0x61ce : [0x100],
-            0x618e : [0x226, 0x9999, 0x100]
+            0x618e : [0x226, 0x227, 0x9999, 0x100]
             },
 
     # Archos


@@ -12,7 +12,7 @@ from urllib import urlencode
 from threading import Thread
 from Queue import Queue, Empty
 
-from lxml.html import soupparser, tostring
+from lxml.html import tostring
 
 from calibre import as_unicode
 from calibre.ebooks.metadata import check_isbn
@@ -23,6 +23,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.library.comments import sanitize_comments_html
 from calibre.utils.date import parse_date
 from calibre.utils.localization import canonicalize_lang
+from calibre.utils.soupparser import fromstring
 
 class Worker(Thread): # Get details {{{
@@ -199,7 +200,7 @@ class Worker(Thread): # Get details {{{
             return
 
         try:
-            root = soupparser.fromstring(clean_ascii_chars(raw))
+            root = fromstring(clean_ascii_chars(raw))
         except:
             msg = 'Failed to parse amazon details page: %r'%self.url
             self.log.exception(msg)
@@ -623,7 +624,7 @@ class Amazon(Source):
         if found:
             try:
-                root = soupparser.fromstring(clean_ascii_chars(raw))
+                root = fromstring(clean_ascii_chars(raw))
             except:
                 msg = 'Failed to parse amazon page for query: %r'%query
                 log.exception(msg)


@@ -14,13 +14,13 @@ from threading import RLock
 from Queue import Queue, Empty
 
 from lxml import html
-from lxml.html import soupparser
 
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source, Option
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.library.comments import sanitize_comments_html
+from calibre.utils.soupparser import fromstring
 
 ovrdrv_data_cache = {}
 cache_lock = RLock()
@@ -403,7 +403,7 @@ class OverDrive(Source):
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]
         try:
-            root = soupparser.fromstring(raw)
+            root = fromstring(raw)
         except:
             return False


@@ -353,14 +353,14 @@ class MobiReader(object):
                 self.processed_html = self.remove_random_bytes(self.processed_html)
                 root = html.fromstring(self.processed_html)
             if root.xpath('descendant::p/descendant::p'):
-                from lxml.html import soupparser
+                from calibre.utils.soupparser import fromstring
                 self.log.warning('Malformed markup, parsing using BeautifulSoup')
                 try:
-                    root = soupparser.fromstring(self.processed_html)
+                    root = fromstring(self.processed_html)
                 except Exception:
                     self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                     self.processed_html = self.remove_random_bytes(self.processed_html)
-                    root = soupparser.fromstring(self.processed_html)
+                    root = fromstring(self.processed_html)
 
         if root.tag != 'html':
             self.log.warn('File does not have opening <html> tag')


@@ -894,8 +894,8 @@ class Manifest(object):
             except etree.XMLSyntaxError as err:
                 self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
                 if err.args and err.args[0].startswith('Excessive depth'):
-                    from lxml.html import soupparser
-                    data = soupparser.fromstring(data)
+                    from calibre.utils.soupparser import fromstring
+                    data = fromstring(data)
                 else:
                     data = html.fromstring(data)
                 data.attrib.pop('xmlns', None)


@@ -8,7 +8,6 @@ __docformat__ = 'restructuredtext en'
 import re, os
 
 from lxml import html
-from lxml.html import soupparser
 
 from PyQt4.Qt import QApplication, QFontInfo, QSize, QWidget, QPlainTextEdit, \
     QToolBar, QVBoxLayout, QAction, QIcon, Qt, QTabWidget, QUrl, \
@@ -19,6 +18,7 @@ from PyQt4.QtWebKit import QWebView, QWebPage
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre import xml_replace_entities
 from calibre.gui2 import open_url
+from calibre.utils.soupparser import fromstring
 
 class PageAction(QAction): # {{{
@@ -227,7 +227,7 @@ class EditorWidget(QWebView): # {{{
             try:
                 root = html.fromstring(raw)
             except:
-                root = soupparser.fromstring(raw)
+                root = fromstring(raw)
             elems = []
             for body in root.xpath('//body'):
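All of the call sites touched above follow the same fallback pattern: try lxml's own HTML parser first, and only hand the markup to the BeautifulSoup-based parser (now bundled as calibre.utils.soupparser, added below) when the strict parse fails. A minimal sketch of that pattern; the function name is illustrative, not from the source.

from lxml import html
from calibre.utils.soupparser import fromstring

def parse_html_forgivingly(raw):
    # Fast path: lxml's own HTML parser.
    try:
        return html.fromstring(raw)
    except Exception:
        # Slow path: let BeautifulSoup untangle the markup, then convert
        # the resulting soup back into an lxml tree.
        return fromstring(raw)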


@@ -0,0 +1,126 @@
__doc__ = """External interface to the BeautifulSoup HTML parser.
"""

__all__ = ["fromstring", "parse", "convert_tree"]

from lxml import etree, html
from calibre.ebooks.BeautifulSoup import \
     BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString


def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)

def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        file = open(file)
    root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)

def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    root = _convert_tree(beautiful_soup_tree, makeelement)
    children = root.getchildren()
    for child in children:
        root.remove(child)
    return children


# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if makeelement is None:
        makeelement = html.html_parser.makeelement
    if 'convertEntities' not in bsargs:
        bsargs['convertEntities'] = 'html'
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root

def _convert_tree(beautiful_soup_tree, makeelement):
    root = makeelement(beautiful_soup_tree.name,
                       attrib=dict(beautiful_soup_tree.attrs))
    _convert_children(root, beautiful_soup_tree, makeelement)
    return root

def _convert_children(parent, beautiful_soup_tree, makeelement):
    SubElement = etree.SubElement
    et_child = None
    for child in beautiful_soup_tree:
        if isinstance(child, Tag):
            et_child = SubElement(parent, child.name, attrib=dict(
                [(k, unescape(v)) for (k, v) in child.attrs]))
            _convert_children(et_child, child, makeelement)
        elif type(child) is NavigableString:
            _append_text(parent, et_child, unescape(child))
        else:
            if isinstance(child, Comment):
                parent.append(etree.Comment(child))
            elif isinstance(child, ProcessingInstruction):
                parent.append(etree.ProcessingInstruction(
                    *child.split(' ', 1)))
            else: # CData
                _append_text(parent, et_child, unescape(child))

def _append_text(parent, element, text):
    if element is None:
        parent.text = (parent.text or '') + text
    else:
        element.tail = (element.tail or '') + text


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint # Python 3
    name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint

import re
handle_entities = re.compile("&(\w+);").sub

def unescape(string):
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0) # use as is
    return handle_entities(unescape_entity, string)
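A minimal usage sketch for this new module (it is imported elsewhere in this commit as calibre.utils.soupparser); the markup is made up to show why the BeautifulSoup fallback is useful.

from lxml import etree
from calibre.utils.soupparser import fromstring

raw = '<p>Unclosed <b>tags & a bare ampersand'   # deliberately sloppy HTML
root = fromstring(raw)                           # root <html> lxml Element
print etree.tostring(root)                       # serialized as a well-formed tree
print root.xpath('//b')                          # regular lxml APIs apply to the result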