Sync to trunk.

John Schember 2009-08-19 18:54:19 -04:00
commit 28d5eff2f8
14 changed files with 454 additions and 89 deletions

View File

@@ -18,13 +18,12 @@ import re
import sys
import glob
from itertools import repeat
from math import ceil
from calibre.devices.interface import DevicePlugin
from calibre.devices.errors import DeviceError, FreeSpaceError
from calibre.devices.usbms.deviceconfig import DeviceConfig
from calibre import iswindows, islinux, isosx, __appname__
from calibre.utils.filenames import ascii_filename as sanitize
from calibre.utils.filenames import ascii_filename as sanitize, shorten_components_to
class Device(DeviceConfig, DevicePlugin):
@@ -669,71 +668,47 @@ class Device(DeviceConfig, DevicePlugin):
return path
def create_upload_path(self, path, mdata, fname):
resizable = []
path = os.path.abspath(path)
newpath = path
if self.SUPPORTS_SUB_DIRS and self.settings().use_subdirs:
extra_components = []
if self.SUPPORTS_SUB_DIRS and self.settings().use_subdirs:
if 'tags' in mdata.keys():
for tag in mdata['tags']:
if tag.startswith(_('News')):
newpath = os.path.join(newpath, 'news')
extra_components.append('news')
c = sanitize(mdata.get('title', ''))
if c:
newpath = os.path.join(newpath, c)
resizable.append(c)
extra_components.append(c)
c = sanitize(mdata.get('timestamp', ''))
if c:
newpath = os.path.join(newpath, c)
resizable.append(c)
extra_components.append(c)
break
elif tag.startswith('/'):
for c in tag.split('/'):
c = sanitize(c)
if not c: continue
newpath = os.path.join(newpath, c)
resizable.append(c)
extra_components.append(c)
break
if newpath == path:
if not extra_components:
c = sanitize(mdata.get('authors', _('Unknown')))
if c:
newpath = os.path.join(newpath, c)
resizable.append(c)
extra_components.append(c)
c = sanitize(mdata.get('title', _('Unknown')))
if c:
extra_components.append(c)
newpath = os.path.join(newpath, c)
resizable.append(c)
newpath = os.path.abspath(newpath)
fname = sanitize(fname)
resizable.append(fname)
extra_components.append(fname)
extra_components = [str(x) for x in extra_components]
components = shorten_components_to(250 - len(path), extra_components)
filepath = os.path.join(path, *components)
filedir = os.path.dirname(filepath)
filepath = os.path.join(newpath, fname)
if len(filepath) > 245:
extra = len(filepath) - 245
delta = int(ceil(extra/float(len(resizable))))
for x in resizable:
if delta > len(x):
r = x[0] if x is resizable[-1] else ''
else:
if x is resizable[-1]:
b, e = os.path.splitext(x)
r = b[:-delta]+e
if r.startswith('.'): r = x[0]+r
else:
r = x[:-delta]
r = r.strip()
if not r:
r = x.strip()[0] if x.strip() else 'x'
if x is resizable[-1]:
filepath = filepath.replace(os.sep+x, os.sep+r)
else:
filepath = filepath.replace(os.sep+x+os.sep, os.sep+r+os.sep)
filepath = filepath.replace(os.sep+os.sep, os.sep).strip()
newpath = os.path.dirname(filepath)
if not os.path.exists(newpath):
os.makedirs(newpath)
if not os.path.exists(filedir):
os.makedirs(filedir)
return filepath
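
The in-place trimming loop in create_upload_path is replaced by a single call to shorten_components_to from calibre.utils.filenames, applied to the collected extra_components with a length budget of 250 - len(path); the returned components are then joined onto the base path. The helper's implementation is not part of this diff, so the following is only a hypothetical sketch of what such a function might do (proportional trimming that preserves the last component's extension), not calibre's code:

# Hypothetical sketch -- the real helper is calibre.utils.filenames.shorten_components_to
# and its exact behaviour may differ.
import os

def shorten_components_sketch(length, components):
    # Characters over budget, counting one separator per component
    over = sum(len(c) for c in components) + len(os.sep) * len(components) - length
    if over <= 0:
        return list(components)
    delta = -(-over // len(components))  # ceiling division: each component's share of the excess
    shortened = []
    for i, c in enumerate(components):
        if i == len(components) - 1:
            base, ext = os.path.splitext(c)  # keep the file extension intact
            c = (base[:-delta] or base[:1]) + ext
        else:
            c = c[:-delta] or c[:1]          # never trim a component to nothing
        shortened.append(c.strip() or 'x')
    return shortened

# Example: squeeze three long components into a 40-character budget
print(shorten_components_sketch(40, ['A Very Long Author Name',
                                     'An Even Longer Book Title',
                                     'some_book_file_name.epub']))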

View File

@@ -45,7 +45,7 @@ class DBAdder(Thread):
self.critical = {}
self.number_of_books_added = 0
self.duplicates = []
self.names, self.path, self.infos = [], [], []
self.names, self.paths, self.infos = [], [], []
Thread.__init__(self)
self.daemon = True
self.input_queue = Queue()

Binary file not shown (new image, 670 B).

Binary file not shown (new image, 670 B).

Binary file not shown (new image, 731 B).

View File

@@ -1445,36 +1445,40 @@ class LibraryDatabase2(LibraryDatabase):
self.notify('add', [id])
def move_library_to(self, newloc, progress=lambda x: x):
books = self.conn.get('SELECT id, path, title FROM books')
if not os.path.exists(newloc):
os.makedirs(newloc)
items = os.listdir(self.library_path)
old_dirs = set([])
for i, book in enumerate(books):
path = book[1]
if not path:
continue
dir = path.split('/')[0]
srcdir = os.path.join(self.library_path, dir)
tdir = os.path.join(newloc, dir)
if os.path.exists(tdir):
shutil.rmtree(tdir)
if os.path.exists(srcdir):
shutil.copytree(srcdir, tdir)
old_dirs.add(srcdir)
progress(book[2])
for i, x in enumerate(items):
src = os.path.join(self.library_path, x)
dest = os.path.join(newloc, x)
if os.path.isdir(src):
if os.path.exists(dest):
shutil.rmtree(dest)
shutil.copytree(src, dest)
old_dirs.add(src)
else:
if os.path.exists(dest):
os.remove(dest)
shutil.copyfile(src, dest)
if not isinstance(x, unicode):
x = x.decode(filesystem_encoding, 'replace')
progress(x)
dbpath = os.path.join(newloc, os.path.basename(self.dbpath))
shutil.copyfile(self.dbpath, dbpath)
opath = self.dbpath
self.conn.close()
self.library_path, self.dbpath = newloc, dbpath
self.connect()
try:
os.unlink(opath)
for dir in old_dirs:
shutil.rmtree(dir)
except:
pass
for dir in old_dirs:
try:
shutil.rmtree(dir)
except:
pass
def __iter__(self):
for record in self.data._data:
@@ -1639,9 +1643,9 @@ books_series_link feeds
def import_book_directory(self, dirpath, callback=None):
dirpath = os.path.abspath(dirpath)
formats = self.find_books_in_directory(dirpath, True)
formats = list(formats)[0]
if not formats:
return
formats = list(iter(formats))
mi = metadata_from_formats(formats)
if mi.title is None:
return
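
The rewritten move_library_to above copies everything that os.listdir finds in the library directory (book folders and loose files alike) instead of walking the books table, and the old directories are now removed one at a time inside their own try/except so a single locked folder cannot abort the whole cleanup. A simplified, self-contained sketch of that copy-then-clean-up pattern (a hypothetical standalone helper; calibre's method additionally copies the metadata database and reconnects to it):

# Simplified sketch of the pattern used by the new move_library_to (hypothetical helper).
import os
import shutil

def move_library(old_loc, new_loc, progress=lambda name: None):
    if not os.path.exists(new_loc):
        os.makedirs(new_loc)
    copied_dirs = set()
    for name in os.listdir(old_loc):
        src = os.path.join(old_loc, name)
        dest = os.path.join(new_loc, name)
        if os.path.isdir(src):
            if os.path.exists(dest):
                shutil.rmtree(dest)        # replace any stale copy at the destination
            shutil.copytree(src, dest)
            copied_dirs.add(src)
        else:
            if os.path.exists(dest):
                os.remove(dest)
            shutil.copyfile(src, dest)
        progress(name)
    for d in copied_dirs:                  # best-effort cleanup: one failure does not stop the rest
        try:
            shutil.rmtree(d)
        except OSError:
            pass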

View File

@@ -31,7 +31,7 @@ from calibre.library.database2 import LibraryDatabase2, FIELD_MAP
from calibre.utils.config import config_dir
from calibre.utils.mdns import publish as publish_zeroconf, \
stop_server as stop_zeroconf
from calibre.ebooks.metadata import fmt_sidx
from calibre.ebooks.metadata import fmt_sidx, title_sort
build_time = datetime.strptime(build_time, '%d %m %Y %H%M%S')
server_resources['jquery.js'] = jquery
@@ -125,6 +125,41 @@ class LibraryServer(object):
</feed>
'''))
STANZA_MAIN = MarkupTemplate(textwrap.dedent('''\
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:py="http://genshi.edgewall.org/">
<title>calibre Library</title>
<id>$id</id>
<updated>${updated.strftime('%Y-%m-%dT%H:%M:%S+00:00')}</updated>
<link rel="search" title="Search" type="application/atom+xml" href="/?search={searchTerms}"/>
<author>
<name>calibre</name>
<uri>http://calibre.kovidgoyal.net</uri>
</author>
<subtitle>
${subtitle}
</subtitle>
<entry>
<title>By Author</title>
<id>urn:uuid:fc000fa0-8c23-11de-a31d-0002a5d5c51b</id>
<updated>${updated.strftime('%Y-%m-%dT%H:%M:%S+00:00')}</updated>
<link type="application/atom+xml" href="/?sortby=byauthor" />
</entry>
<entry>
<title>By Title</title>
<id>urn:uuid:1df4fe40-8c24-11de-b4c6-0002a5d5c51b</id>
<updated>${updated.strftime('%Y-%m-%dT%H:%M:%S+00:00')}</updated>
<link type="application/atom+xml" href="/?sortby=bytitle" />
</entry>
<entry>
<title>By Newest</title>
<id>urn:uuid:3c6d4940-8c24-11de-a4d7-0002a5d5c51b</id>
<updated>${updated.strftime('%Y-%m-%dT%H:%M:%S+00:00')}</updated>
<link type="application/atom+xml" href="/?sortby=bynewest" />
</entry>
</feed>
'''))
def __init__(self, db, opts, embedded=False, show_tracebacks=True):
self.db = db
@@ -295,11 +330,25 @@ class LibraryServer(object):
@expose
def stanza(self, search=None):
def stanza(self, search=None, sortby=None):
'Feeds to read calibre books on a ipod with stanza.'
books = []
updated = self.db.last_modified()
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
cherrypy.response.headers['Content-Type'] = 'text/xml'
if not sortby and not search:
return self.STANZA_MAIN.generate(subtitle='', data=books, FM=FIELD_MAP,
updated=updated, id='urn:calibre:main').render('xml')
ids = self.db.data.parse(search) if search and search.strip() else self.db.data.universal_set()
for record in reversed(list(iter(self.db))):
record_list = list(iter(self.db))
if sortby == "byauthor":
record_list.sort(lambda x, y: cmp(x[FIELD_MAP['author_sort']], y[FIELD_MAP['author_sort']]))
elif sortby == "bytitle":
record_list.sort(lambda x, y: cmp(title_sort(x[FIELD_MAP['title']]),
title_sort(y[FIELD_MAP['title']])))
else:
record_list = reversed(record_list)
for record in record_list:
if record[0] not in ids: continue
r = record[FIELD_MAP['formats']]
r = r.upper() if r else ''
@@ -335,10 +384,6 @@ class LibraryServer(object):
timestamp=strftime('%Y-%m-%dT%H:%M:%S+00:00', record[5]),
).render('xml').decode('utf8'))
updated = self.db.last_modified()
cherrypy.response.headers['Last-Modified'] = self.last_modified(updated)
cherrypy.response.headers['Content-Type'] = 'text/xml'
return self.STANZA.generate(subtitle='', data=books, FM=FIELD_MAP,
updated=updated, id='urn:calibre:main').render('xml')
@@ -389,7 +434,7 @@ class LibraryServer(object):
'The / URL'
want_opds = cherrypy.request.headers.get('Stanza-Device-Name', 919) != \
919 or cherrypy.request.headers.get('Want-OPDS-Catalog', 919) != 919
return self.stanza(search=kwargs.get('search', None)) if want_opds else self.static('index.html')
return self.stanza(search=kwargs.get('search', None), sortby=kwargs.get('sortby',None)) if want_opds else self.static('index.html')
@expose
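
The stanza feed above now accepts a sortby parameter routed through the / URL handler: byauthor and bytitle sort the record list with Python 2 cmp-style comparators (bytitle comparing title_sort values, which normalise leading articles), any other value keeps the previous newest-first ordering, and a request with neither search nor sortby gets the new STANZA_MAIN top-level catalog instead. A small, self-contained illustration of the same orderings, using key functions (equivalent ordering to the cmp comparators in the diff) and a made-up simple_title_sort rather than calibre's title_sort:

# Illustration of the byauthor / bytitle orderings added to LibraryServer.stanza.
# The records and simple_title_sort below are invented for the example.
def simple_title_sort(title):
    for article in ('the ', 'an ', 'a '):
        if title.lower().startswith(article):
            return title[len(article):] + ', ' + title[:len(article)].strip()
    return title

records = [
    {'title': 'The Time Machine',   'author_sort': 'Wells, H. G.'},
    {'title': 'Emma',               'author_sort': 'Austen, Jane'},
    {'title': 'A Study in Scarlet', 'author_sort': 'Doyle, Arthur Conan'},
]

by_author = sorted(records, key=lambda r: r['author_sort'])               # ?sortby=byauthor
by_title  = sorted(records, key=lambda r: simple_title_sort(r['title']))  # ?sortby=bytitle
print([r['author_sort'] for r in by_author])
print([r['title'] for r in by_title])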

View File

@@ -54,6 +54,7 @@ recipe_modules = ['recipe_' + r for r in (
'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
'eltiempo_hn', 'slate', 'tnxm', 'bbcvietnamese', 'vnexpress',
'volksrant', 'theeconomictimes_india', 'ourdailybread',
'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
)]

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Danas(BasicNewsRecipe):
title = 'BETA'
__author__ = 'Darko Miletic'
description = 'Novinska Agencija'
publisher = 'Beta'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
language = _('Serbian')
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [
(u'Vesti dana', u'http://www.beta.rs/rssvd.asp')
,(u'Ekonomija' , u'http://www.beta.rs/rssek.asp')
,(u'Sport' , u'http://www.beta.rs/rsssp.asp')
]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Danas(BasicNewsRecipe):
title = 'BETA - English'
__author__ = 'Darko Miletic'
description = 'Serbian news agency'
publisher = 'Beta'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
language = _('English')
lang = 'en'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
feeds = [(u'News', u'http://www.beta.rs/rssen.asp')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.glas-javnosti.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GlasJavnosti(BasicNewsRecipe):
title = 'Glas Javnosti'
__author__ = 'Darko Miletic'
description = 'Glas javnosti - Mi ne ulepsavamo stvarnost'
publisher = 'Glas Javnosti'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
language = _('Serbian')
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [
dict(name='div', attrs={'id':'above-content'})
,dict(name='div', attrs={'class':'node' })
]
remove_tags = [
dict(name=['object','link'])
,dict(name='div',attrs={'class':['links','meta']})
,dict(name='div',attrs={'id':'block-block-12'})
]
feeds = [
(u'Politika', u'http://www.glas-javnosti.rs/aktuelni-clanci/2')
,(u'Tema', u'http://www.glas-javnosti.rs/aktuelni-clanci/48')
,(u'Drustvo', u'http://www.glas-javnosti.rs/aktuelni-clanci/17')
,(u'Ekonomija', u'http://www.glas-javnosti.rs/aktuelni-clanci/16')
,(u'Dosije', u'http://www.glas-javnosti.rs/aktuelni-clanci/65')
,(u'Svet', u'http://www.glas-javnosti.rs/aktuelni-clanci/18')
,(u'Hronika', u'http://www.glas-javnosti.rs/aktuelni-clanci/19')
,(u'Kultura', u'http://www.glas-javnosti.rs/aktuelni-clanci/6')
,(u'Ljudi i Dogadjaji', u'http://www.glas-javnosti.rs/aktuelni-clanci/37')
,(u'Putovanja', u'http://www.glas-javnosti.rs/aktuelni-clanci/113')
,(u'Feljton', u'http://www.glas-javnosti.rs/aktuelni-clanci/49')
,(u'Sport', u'http://www.glas-javnosti.rs/aktuelni-clanci/1')
,(u'Lov i Ribolov', u'http://www.glas-javnosti.rs/aktuelni-clanci/591')
,(u'Nedelja', u'http://www.glas-javnosti.rs/aktuelni-clanci/1862')
,(u'Glasno', u'http://www.glas-javnosti.rs/aktuelni-clanci/590')
,(u'Tehnologija', u'http://www.glas-javnosti.rs/aktuelni-clanci/609')
,(u'Reflektor', u'http://www.glas-javnosti.rs/aktuelni-clanci/717')
,(u'Saznanja', u'http://www.glas-javnosti.rs/aktuelni-clanci/1694')
,(u'Beograd', u'http://www.glas-javnosti.rs/aktuelni-clanci/40')
,(u'Srbija', u'http://www.glas-javnosti.rs/aktuelni-clanci/114')
,(u'Zapadna Srbija', u'http://www.glas-javnosti.rs/aktuelni-clanci/41')
,(u'Istocna i Juzna Srbija', u'http://www.glas-javnosti.rs/aktuelni-clanci/42')
,(u'Sumadija i Pomoravlje', u'http://www.glas-javnosti.rs/aktuelni-clanci/43')
,(u'Vojvodina', u'http://www.glas-javnosti.rs/aktuelni-clanci/44')
,(u'Republika Srpska', u'http://www.glas-javnosti.rs/aktuelni-clanci/45')
,(u'Slobodno Vreme', u'http://www.glas-javnosti.rs/aktuelni-clanci/61')
,(u'Konjske Snage', u'http://www.glas-javnosti.rs/aktuelni-clanci/46')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)

View File

@@ -8,17 +8,16 @@ www.guardian.co.uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Guardian(BasicNewsRecipe):
title = u'The Guardian'
__author__ = 'Seabound'
__author__ = 'Seabound and Sujata Raman'
language = _('English')
oldest_article = 7
max_articles_per_feed = 20
remove_javascript = True
timefmt = ' [%a, %d %b %Y]'
keep_only_tags = [
dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
@@ -30,20 +29,20 @@ class Guardian(BasicNewsRecipe):
dict(name='ul', attrs={'id':["content-actions"]}),
]
use_embedded_content = False
no_stylesheets = True
extra_css = '''
.article-attributes{font-size: x-small; font-family:Arial,Helvetica,sans-serif;}
.h1{font-size: large ;font-family:georgia,serif; font-weight:bold;}
.stand-first-alone{color:#666666; font-size:small; font-family:Arial,Helvetica,sans-serif;}
.caption{color:#666666; font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
#article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;}
#article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
'''
feeds = [
('Front Page', 'http://www.guardian.co.uk/rss'),
@@ -57,21 +56,30 @@ class Guardian(BasicNewsRecipe):
('Comment','http://www.guardian.co.uk/commentisfree/rss'),
]
def get_article_url(self, article):
url = article.get('guid', None)
if '/video/' in url or '/flyer/' in url or '/quiz/' in url or \
'/gallery/' in url or 'ivebeenthere' in url or \
'pickthescore' in url or 'audioslideshow' in url :
url = None
return url
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(face=True):
for item in soup.findAll(face=True):
del item['face']
for tag in soup.findAll(name=['ul','li']):
tag.name = 'div'
return soup

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
monitorcg.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class MonitorCG(BasicNewsRecipe):
title = 'Monitor online'
__author__ = 'Darko Miletic'
description = 'News from Montenegro'
publisher = 'MONITOR d.o.o. Podgorica'
category = 'news, politics, Montenegro'
oldest_article = 15
max_articles_per_feed = 150
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = _('Montenegrin')
lang ='sr-Latn-Me'
INDEX = 'http://www.monitorcg.com'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'ja-current-content'})]
remove_tags = [ dict(name=['object','link','embed'])
, dict(attrs={'class':['buttonheading','article-section']})]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
def parse_index(self):
totalfeeds = []
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'ja-catslwi'})
if cover_item:
dt = cover_item['onclick'].partition("location.href=")[2]
curl = self.INDEX + dt.strip("'")
lfeeds = [(u'Svi clanci', curl)]
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
contitem = soup.find('div',attrs={'class':'article-content'})
if contitem:
img = contitem.find('img')
if img:
self.cover_url = self.INDEX + img['src']
for item in contitem.findAll('a'):
url = self.INDEX + item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
republika.co.yu
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Republika(BasicNewsRecipe):
title = 'Republika'
__author__ = 'Darko Miletic'
description = 'Glasilo gradjanskog samooslobadjanja. Protiv stihije straha, mrznje i nasilja'
publisher = ' Zadruga Res Publica'
category = 'news, politics, Serbia'
language = _('Serbian')
lang = 'sr-Latn-RS'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1250'
use_embedded_content = False
INDEX = u'http://www.republika.co.yu/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .naslov{font-size: x-large; font-weight: bold} .autor{font-size: small; font-weight: bold} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(attrs={'class':'naslov'})
, dict(attrs={'class':'text1'})
]
remove_tags = [dict(name=['object','link','iframe','base','img'])]
feeds = [(u'Svi clanci', INDEX)]
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'naslovLink'}):
url = item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds