diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe index c934cc4ac4..6572130389 100644 --- a/recipes/benchmark_pl.recipe +++ b/recipes/benchmark_pl.recipe @@ -12,14 +12,17 @@ class BenchmarkPl(BasicNewsRecipe): language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + extra_css = 'ul {list-style-type: none;}' no_stylesheets = True - remove_attributes = ['style'] + #remove_attributes = ['style'] preprocess_regexps = [(re.compile(ur'
 Zobacz poprzednie Opinie dnia:.*', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Więcej o .*?', re.DOTALL|re.IGNORECASE), lambda match: '')] - keep_only_tags = [dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')] + + keep_only_tags = [dict(id=['articleHeader', 'articleGallery']), dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')] remove_tags_after = dict(id='article') remove_tags = [dict(name='div', attrs={'class':['comments', 'body', 'kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery', 'breadcrumb', 'footer', 'moreTopics']}), dict(name='table', attrs = {'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})] + INDEX = 'http://www.benchmark.pl' - feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), + feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'), (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')] @@ -42,46 +45,16 @@ class BenchmarkPl(BasicNewsRecipe): for r in appendtag.findAll(attrs={'class':'changePage'}): r.extract() - - def image_article(self, soup, appendtag): - nexturl = soup.find('div', attrs={'class':'preview'}) - if nexturl: - nexturl = nexturl.find('a', attrs={'class':'move_next'}) - image = appendtag.find('div', attrs={'class':'preview'}).div['style'][16:] - image = self.INDEX + image[:image.find("')")] - appendtag.find(attrs={'class':'preview'}).name='img' - appendtag.find(attrs={'class':'preview'})['src']=image - appendtag.find('a', attrs={'class':'move_next'}).extract() - while nexturl: - nexturl = self.INDEX + nexturl['href'] - soup2 = self.index_to_soup(nexturl) - nexturl = soup2.find('a', attrs={'class':'move_next'}) - image = soup2.find('div', attrs={'class':'preview'}).div['style'][16:] - image = self.INDEX + image[:image.find("')")] - soup2.find(attrs={'class':'preview'}).name='img' - soup2.find(attrs={'class':'preview'})['src']=image - pagetext = soup2.find('div', attrs={'class':'gallery'}) - pagetext.find('div', attrs={'class':'title'}).extract() - pagetext.find('div', attrs={'class':'thumb'}).extract() - pagetext.find('div', attrs={'class':'panelOcenaObserwowane'}).extract() - if nexturl: - pagetext.find('a', attrs={'class':'move_next'}).extract() - pagetext.find('a', attrs={'class':'move_back'}).extract() - comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) - for comment in comments: - comment.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - - def preprocess_html(self, soup): - if soup.find('div', attrs={'class':'preview'}): - self.image_article(soup, soup.body) - else: - self.append_page(soup, soup.body) + self.append_page(soup, soup.body) for a in soup('a'): if a.has_key('href') and not a['href'].startswith('http'): a['href'] = self.INDEX + a['href'] for r in soup.findAll(attrs={'class':['comments', 'body']}): r.extract() + tag1 = soup.find(attrs={'class':'inlineGallery'}) + if tag1: + for tag in tag1.findAll('li'): + tag['style'] = 'float: left; margin-right: 10px;' + tag1.findNext('p')['style'] = 'clear: both;' return soup diff --git a/recipes/comics_com.recipe b/recipes/comics_com.recipe deleted file mode 100644 index 4c08dfea90..0000000000 --- a/recipes/comics_com.recipe +++ /dev/null @@ -1,224 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe - 
-class Comics(BasicNewsRecipe): - title = 'Comics.com' - __author__ = 'Starson17' - description = 'Comics from comics.com. You should customize this recipe to fetch only the comics you are interested in' - language = 'en' - use_embedded_content= False - no_stylesheets = True - oldest_article = 24 - remove_javascript = True - cover_url = 'http://www.bsb.lib.tx.us/images/comics.com.gif' - recursions = 0 - max_articles_per_feed = 10 - num_comics_to_get = 7 - simultaneous_downloads = 1 - # delay = 3 - - keep_only_tags = [dict(name='a', attrs={'class':'STR_StripImage'}), - dict(name='div', attrs={'class':'STR_Date'}) - ] - - def parse_index(self): - feeds = [] - for title, url in [ - ("9 Chickweed Lane", "http://comics.com/9_chickweed_lane"), - ("Agnes", "http://comics.com/agnes"), - ("Alley Oop", "http://comics.com/alley_oop"), - ("Andy Capp", "http://comics.com/andy_capp"), - ("Arlo & Janis", "http://comics.com/arlo&janis"), - ("B.C.", "http://comics.com/bc"), - ("Ballard Street", "http://comics.com/ballard_street"), - # ("Ben", "http://comics.com/ben"), - # ("Betty", "http://comics.com/betty"), - # ("Big Nate", "http://comics.com/big_nate"), - # ("Brevity", "http://comics.com/brevity"), - # ("Candorville", "http://comics.com/candorville"), - # ("Cheap Thrills", "http://comics.com/cheap_thrills"), - # ("Committed", "http://comics.com/committed"), - # ("Cow & Boy", "http://comics.com/cow&boy"), - # ("Daddy's Home", "http://comics.com/daddys_home"), - # ("Dog eat Doug", "http://comics.com/dog_eat_doug"), - # ("Drabble", "http://comics.com/drabble"), - # ("F Minus", "http://comics.com/f_minus"), - # ("Family Tree", "http://comics.com/family_tree"), - # ("Farcus", "http://comics.com/farcus"), - # ("Fat Cats Classics", "http://comics.com/fat_cats_classics"), - # ("Ferd'nand", "http://comics.com/ferdnand"), - # ("Flight Deck", "http://comics.com/flight_deck"), - # ("Flo & Friends", "http://comics.com/flo&friends"), - # ("Fort Knox", "http://comics.com/fort_knox"), - # ("Frank & Ernest", "http://comics.com/frank&ernest"), - # ("Frazz", "http://comics.com/frazz"), - # ("Free Range", "http://comics.com/free_range"), - # ("Geech Classics", "http://comics.com/geech_classics"), - # ("Get Fuzzy", "http://comics.com/get_fuzzy"), - # ("Girls & Sports", "http://comics.com/girls&sports"), - # ("Graffiti", "http://comics.com/graffiti"), - # ("Grand Avenue", "http://comics.com/grand_avenue"), - # ("Heathcliff", "http://comics.com/heathcliff"), - # "Heathcliff, a street-smart and mischievous cat with many adventures." - # ("Herb and Jamaal", "http://comics.com/herb_and_jamaal"), - # ("Herman", "http://comics.com/herman"), - # ("Home and Away", "http://comics.com/home_and_away"), - # ("It's All About You", "http://comics.com/its_all_about_you"), - # ("Jane's World", "http://comics.com/janes_world"), - # ("Jump Start", "http://comics.com/jump_start"), - # ("Kit 'N' Carlyle", "http://comics.com/kit_n_carlyle"), - # ("Li'l Abner Classics", "http://comics.com/lil_abner_classics"), - # ("Liberty Meadows", "http://comics.com/liberty_meadows"), - # ("Little Dog Lost", "http://comics.com/little_dog_lost"), - # ("Lola", "http://comics.com/lola"), - # ("Luann", "http://comics.com/luann"), - # ("Marmaduke", "http://comics.com/marmaduke"), - # ("Meg! 
Classics", "http://comics.com/meg_classics"), - # ("Minimum Security", "http://comics.com/minimum_security"), - # ("Moderately Confused", "http://comics.com/moderately_confused"), - # ("Momma", "http://comics.com/momma"), - # ("Monty", "http://comics.com/monty"), - # ("Motley Classics", "http://comics.com/motley_classics"), - # ("Nancy", "http://comics.com/nancy"), - # ("Natural Selection", "http://comics.com/natural_selection"), - # ("Nest Heads", "http://comics.com/nest_heads"), - # ("Off The Mark", "http://comics.com/off_the_mark"), - # ("On a Claire Day", "http://comics.com/on_a_claire_day"), - # ("One Big Happy Classics", "http://comics.com/one_big_happy_classics"), - # ("Over the Hedge", "http://comics.com/over_the_hedge"), - # ("PC and Pixel", "http://comics.com/pc_and_pixel"), - # ("Peanuts", "http://comics.com/peanuts"), - # ("Pearls Before Swine", "http://comics.com/pearls_before_swine"), - # ("Pickles", "http://comics.com/pickles"), - # ("Prickly City", "http://comics.com/prickly_city"), - # ("Raising Duncan Classics", "http://comics.com/raising_duncan_classics"), - # ("Reality Check", "http://comics.com/reality_check"), - # ("Red & Rover", "http://comics.com/red&rover"), - # ("Rip Haywire", "http://comics.com/rip_haywire"), - # ("Ripley's Believe It or Not!", "http://comics.com/ripleys_believe_it_or_not"), - # ("Rose Is Rose", "http://comics.com/rose_is_rose"), - # ("Rubes", "http://comics.com/rubes"), - # ("Rudy Park", "http://comics.com/rudy_park"), - # ("Scary Gary", "http://comics.com/scary_gary"), - # ("Shirley and Son Classics", "http://comics.com/shirley_and_son_classics"), - # ("Soup To Nutz", "http://comics.com/soup_to_nutz"), - # ("Speed Bump", "http://comics.com/speed_bump"), - # ("Spot The Frog", "http://comics.com/spot_the_frog"), - # ("State of the Union", "http://comics.com/state_of_the_union"), - # ("Strange Brew", "http://comics.com/strange_brew"), - # ("Tarzan Classics", "http://comics.com/tarzan_classics"), - # ("That's Life", "http://comics.com/thats_life"), - # ("The Barn", "http://comics.com/the_barn"), - # ("The Born Loser", "http://comics.com/the_born_loser"), - # ("The Buckets", "http://comics.com/the_buckets"), - # ("The Dinette Set", "http://comics.com/the_dinette_set"), - # ("The Grizzwells", "http://comics.com/the_grizzwells"), - # ("The Humble Stumble", "http://comics.com/the_humble_stumble"), - # ("The Knight Life", "http://comics.com/the_knight_life"), - # ("The Meaning of Lila", "http://comics.com/the_meaning_of_lila"), - # ("The Other Coast", "http://comics.com/the_other_coast"), - # ("The Sunshine Club", "http://comics.com/the_sunshine_club"), - # ("Unstrange Phenomena", "http://comics.com/unstrange_phenomena"), - # ("Watch Your Head", "http://comics.com/watch_your_head"), - # ("Wizard of Id", "http://comics.com/wizard_of_id"), - # ("Working Daze", "http://comics.com/working_daze"), - # ("Working It Out", "http://comics.com/working_it_out"), - # ("Zack Hill", "http://comics.com/zack_hill"), - # ("(Th)ink", "http://comics.com/think"), - # "Tackling the political and social issues impacting communities of color." - # ("Adam Zyglis", "http://comics.com/adam_zyglis"), - # "Known for his excellent caricatures, as well as independent and incisive imagery. " - # ("Andy Singer", "http://comics.com/andy_singer"), - # ("Bill Day", "http://comics.com/bill_day"), - # "Powerful images on sensitive issues." 
- # ("Bill Schorr", "http://comics.com/bill_schorr"), - # ("Bob Englehart", "http://comics.com/bob_englehart"), - # ("Brian Fairrington", "http://comics.com/brian_fairrington"), - # ("Bruce Beattie", "http://comics.com/bruce_beattie"), - # ("Cam Cardow", "http://comics.com/cam_cardow"), - # ("Chip Bok", "http://comics.com/chip_bok"), - # ("Chris Britt", "http://comics.com/chris_britt"), - # ("Chuck Asay", "http://comics.com/chuck_asay"), - # ("Clay Bennett", "http://comics.com/clay_bennett"), - # ("Daryl Cagle", "http://comics.com/daryl_cagle"), - # ("David Fitzsimmons", "http://comics.com/david_fitzsimmons"), - # "David Fitzsimmons is a new editorial cartoons on comics.com. He is also a staff writer and editorial cartoonist for the Arizona Daily Star. " - # ("Drew Litton", "http://comics.com/drew_litton"), - # "Drew Litton is an artist who is probably best known for his sports cartoons. He received the National Cartoonist Society Sports Cartoon Award for 1993. " - # ("Ed Stein", "http://comics.com/ed_stein"), - # "Winner of the Fischetti Award in 2006 and the Scripps Howard National Journalism Award, 1999, Ed Stein has been the editorial cartoonist for the Rocky Mountain News since 1978. " - # ("Eric Allie", "http://comics.com/eric_allie"), - # "Eric Allie is an editorial cartoonist with the Pioneer Press and CNS News. " - # ("Gary Markstein", "http://comics.com/gary_markstein"), - # ("Gary McCoy", "http://comics.com/gary_mccoy"), - # "Gary McCoy is known for his editorial cartoons, humor and inane ramblings. He is a 2 time nominee for Best Magazine Cartoonist of the Year by the National Cartoonists Society. He resides in Belleville, IL. " - # ("Gary Varvel", "http://comics.com/gary_varvel"), - # ("Henry Payne", "http://comics.com/henry_payne"), - # ("JD Crowe", "http://comics.com/jd_crowe"), - # ("Jeff Parker", "http://comics.com/jeff_parker"), - # ("Jeff Stahler", "http://comics.com/jeff_stahler"), - # ("Jerry Holbert", "http://comics.com/jerry_holbert"), - # ("John Cole", "http://comics.com/john_cole"), - # ("John Darkow", "http://comics.com/john_darkow"), - # "John Darkow is a contributing editorial cartoonist for the Humor Times as well as editoiral cartoonist for the Columbia Daily Tribune, Missouri" - # ("John Sherffius", "http://comics.com/john_sherffius"), - # ("Larry Wright", "http://comics.com/larry_wright"), - # ("Lisa Benson", "http://comics.com/lisa_benson"), - # ("Marshall Ramsey", "http://comics.com/marshall_ramsey"), - # ("Matt Bors", "http://comics.com/matt_bors"), - # ("Michael Ramirez", "http://comics.com/michael_ramirez"), - # ("Mike Keefe", "http://comics.com/mike_keefe"), - # ("Mike Luckovich", "http://comics.com/mike_luckovich"), - # ("MIke Thompson", "http://comics.com/mike_thompson"), - # ("Monte Wolverton", "http://comics.com/monte_wolverton"), - # "Unique mix of perspectives" - # ("Mr. Fish", "http://comics.com/mr_fish"), - # "Side effects may include swelling" - # ("Nate Beeler", "http://comics.com/nate_beeler"), - # "Middle America meets the Beltway." - # ("Nick Anderson", "http://comics.com/nick_anderson"), - # ("Pat Bagley", "http://comics.com/pat_bagley"), - # "Unfair and Totally Unbalanced." 
- # ("Paul Szep", "http://comics.com/paul_szep"), - # ("RJ Matson", "http://comics.com/rj_matson"), - # "Power cartoons from NYC and Capitol Hill" - # ("Rob Rogers", "http://comics.com/rob_rogers"), - # "Humorous slant on current events" - # ("Robert Ariail", "http://comics.com/robert_ariail"), - # "Clever and unpredictable" - # ("Scott Stantis", "http://comics.com/scott_stantis"), - # ("Signe Wilkinson", "http://comics.com/signe_wilkinson"), - # ("Steve Benson", "http://comics.com/steve_benson"), - # ("Steve Breen", "http://comics.com/steve_breen"), - # ("Steve Kelley", "http://comics.com/steve_kelley"), - # ("Steve Sack", "http://comics.com/steve_sack"), - ]: - articles = self.make_links(url) - if articles: - feeds.append((title, articles)) - return feeds - - def make_links(self, url): - soup = self.index_to_soup(url) - # print 'soup: ', soup - title = '' - current_articles = [] - pages = range(1, self.num_comics_to_get+1) - for page in pages: - page_url = url + '/?Page=' + str(page) - soup = self.index_to_soup(page_url) - if soup: - strip_tag = soup.find('a', attrs={'class': 'STR_StripImage'}) - if strip_tag: - print 'strip_tag: ', strip_tag - title = strip_tag['title'] - print 'title: ', title - current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''}) - current_articles.reverse() - return current_articles - - extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} - h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - p{font-family:Arial,Helvetica,sans-serif;font-size:small;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe index 6d4e2a179f..8bf1f55124 100644 --- a/recipes/computerworld_pl.recipe +++ b/recipes/computerworld_pl.recipe @@ -16,7 +16,7 @@ class Computerworld_pl(BasicNewsRecipe): preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''),] keep_only_tags = [dict(id=['szpaltaL', 's2011'])] remove_tags_after = dict(name='div', attrs={'class':'tresc'}) - remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}),] + remove_tags = [dict(attrs={'class':['nnav', 'rMobi', 'tagi', 'rec']}), dict(name='a', attrs={'target':'_blank'})] feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe index 7e6549e713..982497f453 100644 --- a/recipes/conowego_pl.recipe +++ b/recipes/conowego_pl.recipe @@ -15,6 +15,7 @@ class CoNowegoPl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} keep_only_tags = [dict(name='div', attrs={'class':'news_list single_view'})] remove_tags = [dict(name='div', attrs={'class':['ni_bottom', 'ni_rank', 'ni_date']})] feeds = [(u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'), (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'), (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'), (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100')] diff --git a/recipes/di.recipe b/recipes/di.recipe index dad0fdd648..b4903f9614 100644 --- a/recipes/di.recipe +++ b/recipes/di.recipe @@ -1,4 +1,5 @@ #!/usr/bin/env python +# vim:fileencoding=UTF-8 __license__ = 'GPL v3' __author__ = 'Mori' @@ -14,7 +15,7 @@ class 
DziennikInternautowRecipe(BasicNewsRecipe): __author__ = 'Mori' language = 'pl' - title = u'Dziennik Internautow' + title = u'Dziennik Internautów' publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.' description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.' diff --git a/recipes/dziennik_lodzki.recipe b/recipes/dziennik_lodzki.recipe index 93a86fdaa2..ecde189e64 100644 --- a/recipes/dziennik_lodzki.recipe +++ b/recipes/dziennik_lodzki.recipe @@ -16,7 +16,7 @@ class DziennikLodzki(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} #preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'
', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) - remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/piano'})] feeds = [(u'Na sygnale', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_nasygnale.xml?201302'), (u'\u0141\xf3d\u017a', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_lodz.xml?201302'), (u'Opinie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_opinie.xml?201302'), (u'Pieni\u0105dze', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533763/index.rss?201302'), (u'Kultura', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533762/index.rss?201302'), (u'Sport', u'http://dzienniklodzki.feedsportal.com/c/32980/f/533761/index.rss?201302'), (u'Akcje', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_akcje.xml?201302'), (u'M\xf3j Reporter', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_mojreporter.xml?201302'), (u'Studni\xf3wki', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_studniowki.xml?201302'), (u'Kraj', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_kraj.xml?201302'), (u'Zdrowie', u'http://www.dzienniklodzki.pl/rss/dzienniklodzki_zdrowie.xml?201302')] diff --git a/recipes/dziennik_zachodni.recipe b/recipes/dziennik_zachodni.recipe index 126c876937..3fc6512bef 100644 --- a/recipes/dziennik_zachodni.recipe +++ b/recipes/dziennik_zachodni.recipe @@ -16,7 +16,7 @@ class DziennikZachodni(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} #preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'
', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) - remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}), dict(attrs={'href':'http://www.dziennikzachodni.pl/piano'})] + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}), dict(attrs={'href':'http://www.dziennikzachodni.pl/piano'}), dict(name='aside')] feeds = [(u'Wszystkie', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533764/index.rss?201302'), (u'Wiadomo\u015bci', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533765/index.rss?201302'), (u'Regiony', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'), (u'Opinie', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_regiony.xml?201302'), (u'Blogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_blogi.xml?201302'), (u'Serwisy', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_serwisy.xml?201302'), (u'Sport', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533766/index.rss?201302'), (u'M\xf3j Reporter', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_mojreporter.xml?201302'), (u'Na narty', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_nanarty.xml?201302'), (u'Drogi', u'http://www.dziennikzachodni.pl/rss/dziennikzachodni_drogi.xml?201302'), (u'Pieni\u0105dze', u'http://dziennikzachodni.feedsportal.com/c/32980/f/533768/index.rss?201302')] diff --git a/recipes/echo_dnia.recipe b/recipes/echo_dnia.recipe index def87ce0e1..7dc913d062 100644 --- a/recipes/echo_dnia.recipe +++ b/recipes/echo_dnia.recipe @@ -16,6 +16,7 @@ class EchoDnia(BasicNewsRecipe): max_articles_per_feed = 100 remove_empty_feeds = True no_stylesheets = True + use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} preprocess_regexps = [(re.compile(ur'Czytaj:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?', re.DOTALL|re.IGNORECASE), lambda match: ''), diff --git a/recipes/ekundelek_pl.recipe b/recipes/ekundelek_pl.recipe index ebc5d39bbd..bf0963092f 100644 --- a/recipes/ekundelek_pl.recipe +++ b/recipes/ekundelek_pl.recipe @@ -12,7 +12,7 @@ class swiatczytnikow(BasicNewsRecipe): __author__ = u'Artur Stachecki' oldest_article = 7 max_articles_per_feed = 100 - + remove_empty_feeds = True remove_tags = [dict(name = 'div', attrs = {'class' : 'feedflare'})] feeds = [(u'Wpisy', u'http://feeds.feedburner.com/Ekundelekpl?format=xml')] diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe index 0b3b207c5e..18a7ab2726 100644 --- a/recipes/emuzica_pl.recipe +++ b/recipes/emuzica_pl.recipe @@ -11,6 +11,7 @@ class eMuzyka(BasicNewsRecipe): cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg' no_stylesheets = True oldest_article = 7 + remove_empty_feeds = True max_articles_per_feed = 100 remove_attributes = ['style'] keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})] diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 310077cdec..880aea5bc1 100644 --- 
a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -9,6 +9,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe): description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.' category = 'newspaper' publication_type = 'newspaper' + #encoding = 'iso-8859-2' masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' INDEX = 'http://wyborcza.pl' remove_empty_feeds = True @@ -16,6 +17,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe): max_articles_per_feed = 100 remove_javascript = True no_stylesheets = True + use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} remove_tags_before = dict(id='k0') remove_tags_after = dict(id='banP4') @@ -24,7 +26,19 @@ class Gazeta_Wyborcza(BasicNewsRecipe): (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), - (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') + (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), + (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), + (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), + (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), + (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), + (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), + (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), + (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), + (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), + (u'We wt. 
- Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), + (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), + (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), + (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') ] def skip_ad_pages(self, soup): diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index 6d3528b0bc..37c129aaa1 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -31,6 +31,14 @@ class Gildia(BasicNewsRecipe): for link in content.findAll(name='a'): if 'fragment' in link['href']: return self.index_to_soup(link['href'], raw=True) + if 'relacj' in soup.title.string.lower(): + for link in content.findAll(name='a'): + if 'relacj' in link['href']: + return self.index_to_soup(link['href'], raw=True) + if 'wywiad' in soup.title.string.lower(): + for link in content.findAll(name='a'): + if 'wywiad' in link['href']: + return self.index_to_soup(link['href'], raw=True) def preprocess_html(self, soup): diff --git a/recipes/glos_wielkopolski.recipe b/recipes/glos_wielkopolski.recipe index d7706c4173..1b7f3f7817 100644 --- a/recipes/glos_wielkopolski.recipe +++ b/recipes/glos_wielkopolski.recipe @@ -16,7 +16,7 @@ class GlosWielkopolski(BasicNewsRecipe): ignore_duplicate_articles = {'title', 'url'} #preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'
', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}) - remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})] + remove_tags=[dict(id='mat-podobne'), dict(name='a', attrs={'class':'czytajDalej'}), dict(attrs={'src':'http://nm.dz.com.pl/dz.png'}), dict(name='a', attrs={'href': 'http://www.gloswielkopolski.pl/newsletter/'})] feeds = [(u'Wszystkie', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533779/index.rss?201302'), (u'Wiadomo\u015bci', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533780/index.rss?201302'), (u'Sport', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533781/index.rss?201302'), (u'Kultura', u'http://gloswielkopolski.feedsportal.com/c/32980/f/533782/index.rss?201302'), (u'Porady', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_porady.xml?201302'), (u'Blogi', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_blogi.xml?201302'), (u'Nasze akcje', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_naszeakcje.xml?201302'), (u'Opinie', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_opinie.xml?201302'), (u'Magazyn', u'http://www.gloswielkopolski.pl/rss/gloswielkopolski_magazyn.xml?201302')] diff --git a/recipes/go_comics.recipe b/recipes/go_comics.recipe index 109d3568ea..c3d44cb77a 100644 --- a/recipes/go_comics.recipe +++ b/recipes/go_comics.recipe @@ -1,229 +1,443 @@ +__license__ = 'GPL v3' +__copyright__ = 'Copyright 2010 Starson17' +''' +www.gocomics.com +''' from calibre.web.feeds.news import BasicNewsRecipe +import re - -class Comics(BasicNewsRecipe): - title = 'Comics.com' +class GoComics(BasicNewsRecipe): + title = 'Go Comics' __author__ = 'Starson17' - description = 'Comics from comics.com. You should customize this recipe to fetch only the comics you are interested in' + __version__ = '1.06' + __date__ = '07 June 2011' + description = u'200+ Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.' + category = 'news, comics' language = 'en' use_embedded_content= False no_stylesheets = True - oldest_article = 24 remove_javascript = True - cover_url = 'http://www.bsb.lib.tx.us/images/comics.com.gif' - recursions = 0 - max_articles_per_feed = 10 - num_comics_to_get = 7 - simultaneous_downloads = 1 - # delay = 3 + remove_attributes = ['style'] - keep_only_tags = [dict(name='h1'), - dict(name='p', attrs={'class':'feature_item'}) + ####### USER PREFERENCES - COMICS, IMAGE SIZE AND NUMBER OF COMICS TO RETRIEVE ######## + # num_comics_to_get - I've tried up to 99 on Calvin&Hobbes + num_comics_to_get = 1 + # comic_size 300 is small, 600 is medium, 900 is large, 1500 is extra-large + comic_size = 900 + # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS + # Please do not overload their servers by selecting all comics and 1000 strips from each! 
+ + conversion_options = {'linearize_tables' : True + , 'comment' : description + , 'tags' : category + , 'language' : language + } + + keep_only_tags = [dict(name='div', attrs={'class':['feature','banner']}), ] + remove_tags = [dict(name='a', attrs={'class':['beginning','prev','cal','next','newest']}), + dict(name='div', attrs={'class':['tag-wrapper']}), + dict(name='a', attrs={'href':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}), + dict(name='img', attrs={'src':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}), + dict(name='ul', attrs={'class':['share-nav','feature-nav']}), + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.addheaders = [('Referer','http://www.gocomics.com/')] + return br + def parse_index(self): feeds = [] for title, url in [ - ("9 Chickweed Lane", "http://gocomics.com/9_chickweed_lane"), - ("Agnes", "http://gocomics.com/agnes"), - ("Alley Oop", "http://gocomics.com/alley_oop"), - ("Andy Capp", "http://gocomics.com/andy_capp"), - ("Arlo & Janis", "http://gocomics.com/arlo&janis"), - ("B.C.", "http://gocomics.com/bc"), - ("Ballard Street", "http://gocomics.com/ballard_street"), - # ("Ben", "http://comics.com/ben"), - # ("Betty", "http://comics.com/betty"), - # ("Big Nate", "http://comics.com/big_nate"), - # ("Brevity", "http://comics.com/brevity"), - # ("Candorville", "http://comics.com/candorville"), - # ("Cheap Thrills", "http://comics.com/cheap_thrills"), - # ("Committed", "http://comics.com/committed"), - # ("Cow & Boy", "http://comics.com/cow&boy"), - # ("Daddy's Home", "http://comics.com/daddys_home"), - # ("Dog eat Doug", "http://comics.com/dog_eat_doug"), - # ("Drabble", "http://comics.com/drabble"), - # ("F Minus", "http://comics.com/f_minus"), - # ("Family Tree", "http://comics.com/family_tree"), - # ("Farcus", "http://comics.com/farcus"), - # ("Fat Cats Classics", "http://comics.com/fat_cats_classics"), - # ("Ferd'nand", "http://comics.com/ferdnand"), - # ("Flight Deck", "http://comics.com/flight_deck"), - # ("Flo & Friends", "http://comics.com/flo&friends"), - # ("Fort Knox", "http://comics.com/fort_knox"), - # ("Frank & Ernest", "http://comics.com/frank&ernest"), - # ("Frazz", "http://comics.com/frazz"), - # ("Free Range", "http://comics.com/free_range"), - # ("Geech Classics", "http://comics.com/geech_classics"), - # ("Get Fuzzy", "http://comics.com/get_fuzzy"), - # ("Girls & Sports", "http://comics.com/girls&sports"), - # ("Graffiti", "http://comics.com/graffiti"), - # ("Grand Avenue", "http://comics.com/grand_avenue"), - # ("Heathcliff", "http://comics.com/heathcliff"), - # "Heathcliff, a street-smart and mischievous cat with many adventures." - # ("Herb and Jamaal", "http://comics.com/herb_and_jamaal"), - # ("Herman", "http://comics.com/herman"), - # ("Home and Away", "http://comics.com/home_and_away"), - # ("It's All About You", "http://comics.com/its_all_about_you"), - # ("Jane's World", "http://comics.com/janes_world"), - # ("Jump Start", "http://comics.com/jump_start"), - # ("Kit 'N' Carlyle", "http://comics.com/kit_n_carlyle"), - # ("Li'l Abner Classics", "http://comics.com/lil_abner_classics"), - # ("Liberty Meadows", "http://comics.com/liberty_meadows"), - # ("Little Dog Lost", "http://comics.com/little_dog_lost"), - # ("Lola", "http://comics.com/lola"), - # ("Luann", "http://comics.com/luann"), - # ("Marmaduke", "http://comics.com/marmaduke"), - # ("Meg! 
Classics", "http://comics.com/meg_classics"), - # ("Minimum Security", "http://comics.com/minimum_security"), - # ("Moderately Confused", "http://comics.com/moderately_confused"), - # ("Momma", "http://comics.com/momma"), - # ("Monty", "http://comics.com/monty"), - # ("Motley Classics", "http://comics.com/motley_classics"), - # ("Nancy", "http://comics.com/nancy"), - # ("Natural Selection", "http://comics.com/natural_selection"), - # ("Nest Heads", "http://comics.com/nest_heads"), - # ("Off The Mark", "http://comics.com/off_the_mark"), - # ("On a Claire Day", "http://comics.com/on_a_claire_day"), - # ("One Big Happy Classics", "http://comics.com/one_big_happy_classics"), - # ("Over the Hedge", "http://comics.com/over_the_hedge"), - # ("PC and Pixel", "http://comics.com/pc_and_pixel"), - # ("Peanuts", "http://comics.com/peanuts"), - # ("Pearls Before Swine", "http://comics.com/pearls_before_swine"), - # ("Pickles", "http://comics.com/pickles"), - # ("Prickly City", "http://comics.com/prickly_city"), - # ("Raising Duncan Classics", "http://comics.com/raising_duncan_classics"), - # ("Reality Check", "http://comics.com/reality_check"), - # ("Red & Rover", "http://comics.com/red&rover"), - # ("Rip Haywire", "http://comics.com/rip_haywire"), - # ("Ripley's Believe It or Not!", "http://comics.com/ripleys_believe_it_or_not"), - # ("Rose Is Rose", "http://comics.com/rose_is_rose"), - # ("Rubes", "http://comics.com/rubes"), - # ("Rudy Park", "http://comics.com/rudy_park"), - # ("Scary Gary", "http://comics.com/scary_gary"), - # ("Shirley and Son Classics", "http://comics.com/shirley_and_son_classics"), - # ("Soup To Nutz", "http://comics.com/soup_to_nutz"), - # ("Speed Bump", "http://comics.com/speed_bump"), - # ("Spot The Frog", "http://comics.com/spot_the_frog"), - # ("State of the Union", "http://comics.com/state_of_the_union"), - # ("Strange Brew", "http://comics.com/strange_brew"), - # ("Tarzan Classics", "http://comics.com/tarzan_classics"), - # ("That's Life", "http://comics.com/thats_life"), - # ("The Barn", "http://comics.com/the_barn"), - # ("The Born Loser", "http://comics.com/the_born_loser"), - # ("The Buckets", "http://comics.com/the_buckets"), - # ("The Dinette Set", "http://comics.com/the_dinette_set"), - # ("The Grizzwells", "http://comics.com/the_grizzwells"), - # ("The Humble Stumble", "http://comics.com/the_humble_stumble"), - # ("The Knight Life", "http://comics.com/the_knight_life"), - # ("The Meaning of Lila", "http://comics.com/the_meaning_of_lila"), - # ("The Other Coast", "http://comics.com/the_other_coast"), - # ("The Sunshine Club", "http://comics.com/the_sunshine_club"), - # ("Unstrange Phenomena", "http://comics.com/unstrange_phenomena"), - # ("Watch Your Head", "http://comics.com/watch_your_head"), - # ("Wizard of Id", "http://comics.com/wizard_of_id"), - # ("Working Daze", "http://comics.com/working_daze"), - # ("Working It Out", "http://comics.com/working_it_out"), - # ("Zack Hill", "http://comics.com/zack_hill"), - # ("(Th)ink", "http://comics.com/think"), - # "Tackling the political and social issues impacting communities of color." - # ("Adam Zyglis", "http://comics.com/adam_zyglis"), - # "Known for his excellent caricatures, as well as independent and incisive imagery. " - # ("Andy Singer", "http://comics.com/andy_singer"), - # ("Bill Day", "http://comics.com/bill_day"), - # "Powerful images on sensitive issues." 
- # ("Bill Schorr", "http://comics.com/bill_schorr"), - # ("Bob Englehart", "http://comics.com/bob_englehart"), - # ("Brian Fairrington", "http://comics.com/brian_fairrington"), - # ("Bruce Beattie", "http://comics.com/bruce_beattie"), - # ("Cam Cardow", "http://comics.com/cam_cardow"), - # ("Chip Bok", "http://comics.com/chip_bok"), - # ("Chris Britt", "http://comics.com/chris_britt"), - # ("Chuck Asay", "http://comics.com/chuck_asay"), - # ("Clay Bennett", "http://comics.com/clay_bennett"), - # ("Daryl Cagle", "http://comics.com/daryl_cagle"), - # ("David Fitzsimmons", "http://comics.com/david_fitzsimmons"), - # "David Fitzsimmons is a new editorial cartoons on comics.com. He is also a staff writer and editorial cartoonist for the Arizona Daily Star. " - # ("Drew Litton", "http://comics.com/drew_litton"), - # "Drew Litton is an artist who is probably best known for his sports cartoons. He received the National Cartoonist Society Sports Cartoon Award for 1993. " - # ("Ed Stein", "http://comics.com/ed_stein"), - # "Winner of the Fischetti Award in 2006 and the Scripps Howard National Journalism Award, 1999, Ed Stein has been the editorial cartoonist for the Rocky Mountain News since 1978. " - # ("Eric Allie", "http://comics.com/eric_allie"), - # "Eric Allie is an editorial cartoonist with the Pioneer Press and CNS News. " - # ("Gary Markstein", "http://comics.com/gary_markstein"), - # ("Gary McCoy", "http://comics.com/gary_mccoy"), - # "Gary McCoy is known for his editorial cartoons, humor and inane ramblings. He is a 2 time nominee for Best Magazine Cartoonist of the Year by the National Cartoonists Society. He resides in Belleville, IL. " - # ("Gary Varvel", "http://comics.com/gary_varvel"), - # ("Henry Payne", "http://comics.com/henry_payne"), - # ("JD Crowe", "http://comics.com/jd_crowe"), - # ("Jeff Parker", "http://comics.com/jeff_parker"), - # ("Jeff Stahler", "http://comics.com/jeff_stahler"), - # ("Jerry Holbert", "http://comics.com/jerry_holbert"), - # ("John Cole", "http://comics.com/john_cole"), - # ("John Darkow", "http://comics.com/john_darkow"), - # "John Darkow is a contributing editorial cartoonist for the Humor Times as well as editoiral cartoonist for the Columbia Daily Tribune, Missouri" - # ("John Sherffius", "http://comics.com/john_sherffius"), - # ("Larry Wright", "http://comics.com/larry_wright"), - # ("Lisa Benson", "http://comics.com/lisa_benson"), - # ("Marshall Ramsey", "http://comics.com/marshall_ramsey"), - # ("Matt Bors", "http://comics.com/matt_bors"), - # ("Michael Ramirez", "http://comics.com/michael_ramirez"), - # ("Mike Keefe", "http://comics.com/mike_keefe"), - # ("Mike Luckovich", "http://comics.com/mike_luckovich"), - # ("MIke Thompson", "http://comics.com/mike_thompson"), - # ("Monte Wolverton", "http://comics.com/monte_wolverton"), - # "Unique mix of perspectives" - # ("Mr. Fish", "http://comics.com/mr_fish"), - # "Side effects may include swelling" - # ("Nate Beeler", "http://comics.com/nate_beeler"), - # "Middle America meets the Beltway." - # ("Nick Anderson", "http://comics.com/nick_anderson"), - # ("Pat Bagley", "http://comics.com/pat_bagley"), - # "Unfair and Totally Unbalanced." 
- # ("Paul Szep", "http://comics.com/paul_szep"), - # ("RJ Matson", "http://comics.com/rj_matson"), - # "Power cartoons from NYC and Capitol Hill" - # ("Rob Rogers", "http://comics.com/rob_rogers"), - # "Humorous slant on current events" - # ("Robert Ariail", "http://comics.com/robert_ariail"), - # "Clever and unpredictable" - # ("Scott Stantis", "http://comics.com/scott_stantis"), - # ("Signe Wilkinson", "http://comics.com/signe_wilkinson"), - # ("Steve Benson", "http://comics.com/steve_benson"), - # ("Steve Breen", "http://comics.com/steve_breen"), - # ("Steve Kelley", "http://comics.com/steve_kelley"), - # ("Steve Sack", "http://comics.com/steve_sack"), - ]: + #(u"2 Cows and a Chicken", u"http://www.gocomics.com/2cowsandachicken"), + #(u"9 Chickweed Lane", u"http://www.gocomics.com/9chickweedlane"), + #(u"Adam At Home", u"http://www.gocomics.com/adamathome"), + #(u"Agnes", u"http://www.gocomics.com/agnes"), + #(u"Alley Oop", u"http://www.gocomics.com/alleyoop"), + #(u"Andy Capp", u"http://www.gocomics.com/andycapp"), + (u"Animal Crackers", u"http://www.gocomics.com/animalcrackers"), + #(u"Annie", u"http://www.gocomics.com/annie"), + #(u"Arlo & Janis", u"http://www.gocomics.com/arloandjanis"), + #(u"Ask Shagg", u"http://www.gocomics.com/askshagg"), + (u"B.C.", u"http://www.gocomics.com/bc"), + #(u"Back in the Day", u"http://www.gocomics.com/backintheday"), + #(u"Bad Reporter", u"http://www.gocomics.com/badreporter"), + (u"Baldo", u"http://www.gocomics.com/baldo"), + #(u"Ballard Street", u"http://www.gocomics.com/ballardstreet"), + #(u"Barkeater Lake", u"http://www.gocomics.com/barkeaterlake"), + #(u"Basic Instructions", u"http://www.gocomics.com/basicinstructions"), + #(u"Ben", u"http://www.gocomics.com/ben"), + #(u"Betty", u"http://www.gocomics.com/betty"), + #(u"Bewley", u"http://www.gocomics.com/bewley"), + #(u"Big Nate", u"http://www.gocomics.com/bignate"), + #(u"Big Top", u"http://www.gocomics.com/bigtop"), + #(u"Biographic", u"http://www.gocomics.com/biographic"), + #(u"Birdbrains", u"http://www.gocomics.com/birdbrains"), + #(u"Bleeker: The Rechargeable Dog", u"http://www.gocomics.com/bleeker"), + #(u"Bliss", u"http://www.gocomics.com/bliss"), + #(u"Bloom County", u"http://www.gocomics.com/bloomcounty"), + #(u"Bo Nanas", u"http://www.gocomics.com/bonanas"), + #(u"Bob the Squirrel", u"http://www.gocomics.com/bobthesquirrel"), + #(u"Boomerangs", u"http://www.gocomics.com/boomerangs"), + #(u"Bottomliners", u"http://www.gocomics.com/bottomliners"), + (u"Bound and Gagged", u"http://www.gocomics.com/boundandgagged"), + #(u"Brainwaves", u"http://www.gocomics.com/brainwaves"), + #(u"Brenda Starr", u"http://www.gocomics.com/brendastarr"), + #(u"Brevity", u"http://www.gocomics.com/brevity"), + #(u"Brewster Rockit", u"http://www.gocomics.com/brewsterrockit"), + (u"Broom Hilda", u"http://www.gocomics.com/broomhilda"), + (u"Calvin and Hobbes", u"http://www.gocomics.com/calvinandhobbes"), + #(u"Candorville", u"http://www.gocomics.com/candorville"), + #(u"Cathy", u"http://www.gocomics.com/cathy"), + #(u"C'est la Vie", u"http://www.gocomics.com/cestlavie"), + #(u"Cheap Thrills", u"http://www.gocomics.com/cheapthrills"), + #(u"Chuckle Bros", u"http://www.gocomics.com/chucklebros"), + #(u"Citizen Dog", u"http://www.gocomics.com/citizendog"), + #(u"Cleats", u"http://www.gocomics.com/cleats"), + #(u"Close to Home", u"http://www.gocomics.com/closetohome"), + #(u"Committed", u"http://www.gocomics.com/committed"), + #(u"Compu-toon", u"http://www.gocomics.com/compu-toon"), + #(u"Cornered", 
u"http://www.gocomics.com/cornered"), + #(u"Cow & Boy", u"http://www.gocomics.com/cow&boy"), + #(u"Cul de Sac", u"http://www.gocomics.com/culdesac"), + #(u"Daddy's Home", u"http://www.gocomics.com/daddyshome"), + #(u"Deep Cover", u"http://www.gocomics.com/deepcover"), + #(u"Dick Tracy", u"http://www.gocomics.com/dicktracy"), + #(u"Dog Eat Doug", u"http://www.gocomics.com/dogeatdoug"), + #(u"Domestic Abuse", u"http://www.gocomics.com/domesticabuse"), + #(u"Doodles", u"http://www.gocomics.com/doodles"), + #(u"Doonesbury", u"http://www.gocomics.com/doonesbury"), + #(u"Drabble", u"http://www.gocomics.com/drabble"), + #(u"Eek!", u"http://www.gocomics.com/eek"), + #(u"F Minus", u"http://www.gocomics.com/fminus"), + #(u"Family Tree", u"http://www.gocomics.com/familytree"), + #(u"Farcus", u"http://www.gocomics.com/farcus"), + #(u"Fat Cats Classics", u"http://www.gocomics.com/fatcatsclassics"), + #(u"Ferd'nand", u"http://www.gocomics.com/ferdnand"), + #(u"Flight Deck", u"http://www.gocomics.com/flightdeck"), + #(u"Flo and Friends", u"http://www.gocomics.com/floandfriends"), + (u"For Better or For Worse", u"http://www.gocomics.com/forbetterorforworse"), + #(u"For Heaven's Sake", u"http://www.gocomics.com/forheavenssake"), + #(u"Fort Knox", u"http://www.gocomics.com/fortknox"), + #(u"FoxTrot Classics", u"http://www.gocomics.com/foxtrotclassics"), + #(u"FoxTrot", u"http://www.gocomics.com/foxtrot"), + (u"Frank & Ernest", u"http://www.gocomics.com/frankandernest"), + #(u"Frazz", u"http://www.gocomics.com/frazz"), + #(u"Fred Basset", u"http://www.gocomics.com/fredbasset"), + #(u"Free Range", u"http://www.gocomics.com/freerange"), + #(u"Frog Applause", u"http://www.gocomics.com/frogapplause"), + #(u"Garfield Minus Garfield", u"http://www.gocomics.com/garfieldminusgarfield"), + (u"Garfield", u"http://www.gocomics.com/garfield"), + #(u"Gasoline Alley", u"http://www.gocomics.com/gasolinealley"), + #(u"Geech Classics", u"http://www.gocomics.com/geechclassics"), + (u"Get Fuzzy", u"http://www.gocomics.com/getfuzzy"), + #(u"Gil Thorp", u"http://www.gocomics.com/gilthorp"), + #(u"Ginger Meggs", u"http://www.gocomics.com/gingermeggs"), + #(u"Girls & Sports", u"http://www.gocomics.com/girlsandsports"), + #(u"Graffiti", u"http://www.gocomics.com/graffiti"), + #(u"Grand Avenue", u"http://www.gocomics.com/grandavenue"), + #(u"Haiku Ewe", u"http://www.gocomics.com/haikuewe"), + #(u"Heart of the City", u"http://www.gocomics.com/heartofthecity"), + #(u"Herb and Jamaal", u"http://www.gocomics.com/herbandjamaal"), + #(u"Home and Away", u"http://www.gocomics.com/homeandaway"), + #(u"Housebroken", u"http://www.gocomics.com/housebroken"), + #(u"Hubert and Abby", u"http://www.gocomics.com/hubertandabby"), + #(u"Imagine This", u"http://www.gocomics.com/imaginethis"), + #(u"In the Bleachers", u"http://www.gocomics.com/inthebleachers"), + #(u"In the Sticks", u"http://www.gocomics.com/inthesticks"), + #(u"Ink Pen", u"http://www.gocomics.com/inkpen"), + #(u"It's All About You", u"http://www.gocomics.com/itsallaboutyou"), + #(u"Jane's World", u"http://www.gocomics.com/janesworld"), + #(u"Joe Vanilla", u"http://www.gocomics.com/joevanilla"), + #(u"Jump Start", u"http://www.gocomics.com/jumpstart"), + #(u"Kit 'N' Carlyle", u"http://www.gocomics.com/kitandcarlyle"), + #(u"La Cucaracha", u"http://www.gocomics.com/lacucaracha"), + #(u"Last Kiss", u"http://www.gocomics.com/lastkiss"), + #(u"Legend of Bill", u"http://www.gocomics.com/legendofbill"), + #(u"Liberty Meadows", u"http://www.gocomics.com/libertymeadows"), + #(u"Li'l Abner 
Classics", u"http://www.gocomics.com/lilabnerclassics"), + #(u"Lio", u"http://www.gocomics.com/lio"), + #(u"Little Dog Lost", u"http://www.gocomics.com/littledoglost"), + #(u"Little Otto", u"http://www.gocomics.com/littleotto"), + #(u"Lola", u"http://www.gocomics.com/lola"), + #(u"Love Is...", u"http://www.gocomics.com/loveis"), + (u"Luann", u"http://www.gocomics.com/luann"), + #(u"Maintaining", u"http://www.gocomics.com/maintaining"), + #(u"Meg! Classics", u"http://www.gocomics.com/megclassics"), + #(u"Middle-Aged White Guy", u"http://www.gocomics.com/middleagedwhiteguy"), + #(u"Minimum Security", u"http://www.gocomics.com/minimumsecurity"), + #(u"Moderately Confused", u"http://www.gocomics.com/moderatelyconfused"), + (u"Momma", u"http://www.gocomics.com/momma"), + #(u"Monty", u"http://www.gocomics.com/monty"), + #(u"Motley Classics", u"http://www.gocomics.com/motleyclassics"), + #(u"Mutt & Jeff", u"http://www.gocomics.com/muttandjeff"), + #(u"Mythtickle", u"http://www.gocomics.com/mythtickle"), + #(u"Nancy", u"http://www.gocomics.com/nancy"), + #(u"Natural Selection", u"http://www.gocomics.com/naturalselection"), + #(u"Nest Heads", u"http://www.gocomics.com/nestheads"), + #(u"NEUROTICA", u"http://www.gocomics.com/neurotica"), + #(u"New Adventures of Queen Victoria", u"http://www.gocomics.com/thenewadventuresofqueenvictoria"), + (u"Non Sequitur", u"http://www.gocomics.com/nonsequitur"), + #(u"Off The Mark", u"http://www.gocomics.com/offthemark"), + #(u"On A Claire Day", u"http://www.gocomics.com/onaclaireday"), + #(u"One Big Happy Classics", u"http://www.gocomics.com/onebighappyclassics"), + #(u"One Big Happy", u"http://www.gocomics.com/onebighappy"), + #(u"Out of the Gene Pool Re-Runs", u"http://www.gocomics.com/outofthegenepool"), + #(u"Over the Hedge", u"http://www.gocomics.com/overthehedge"), + #(u"Overboard", u"http://www.gocomics.com/overboard"), + #(u"PC and Pixel", u"http://www.gocomics.com/pcandpixel"), + (u"Peanuts", u"http://www.gocomics.com/peanuts"), + (u"Pearls Before Swine", u"http://www.gocomics.com/pearlsbeforeswine"), + #(u"Pibgorn Sketches", u"http://www.gocomics.com/pibgornsketches"), + #(u"Pibgorn", u"http://www.gocomics.com/pibgorn"), + #(u"Pickles", u"http://www.gocomics.com/pickles"), + #(u"Pinkerton", u"http://www.gocomics.com/pinkerton"), + #(u"Pluggers", u"http://www.gocomics.com/pluggers"), + (u"Pooch Cafe", u"http://www.gocomics.com/poochcafe"), + #(u"PreTeena", u"http://www.gocomics.com/preteena"), + #(u"Prickly City", u"http://www.gocomics.com/pricklycity"), + #(u"Rabbits Against Magic", u"http://www.gocomics.com/rabbitsagainstmagic"), + #(u"Raising Duncan Classics", u"http://www.gocomics.com/raisingduncanclassics"), + #(u"Real Life Adventures", u"http://www.gocomics.com/reallifeadventures"), + #(u"Reality Check", u"http://www.gocomics.com/realitycheck"), + #(u"Red and Rover", u"http://www.gocomics.com/redandrover"), + #(u"Red Meat", u"http://www.gocomics.com/redmeat"), + #(u"Reynolds Unwrapped", u"http://www.gocomics.com/reynoldsunwrapped"), + #(u"Rip Haywire", u"http://www.gocomics.com/riphaywire"), + #(u"Ronaldinho Gaucho", u"http://www.gocomics.com/ronaldinhogaucho"), + (u"Rose Is Rose", u"http://www.gocomics.com/roseisrose"), + #(u"Rudy Park", u"http://www.gocomics.com/rudypark"), + #(u"Scary Gary", u"http://www.gocomics.com/scarygary"), + #(u"Shirley and Son Classics", u"http://www.gocomics.com/shirleyandsonclassics"), + (u"Shoe", u"http://www.gocomics.com/shoe"), + #(u"Shoecabbage", u"http://www.gocomics.com/shoecabbage"), + #(u"Skin Horse", 
u"http://www.gocomics.com/skinhorse"), + #(u"Slowpoke", u"http://www.gocomics.com/slowpoke"), + #(u"Soup To Nutz", u"http://www.gocomics.com/souptonutz"), + #(u"Spot The Frog", u"http://www.gocomics.com/spotthefrog"), + #(u"State of the Union", u"http://www.gocomics.com/stateoftheunion"), + #(u"Stone Soup", u"http://www.gocomics.com/stonesoup"), + #(u"Sylvia", u"http://www.gocomics.com/sylvia"), + #(u"Tank McNamara", u"http://www.gocomics.com/tankmcnamara"), + #(u"Tarzan Classics", u"http://www.gocomics.com/tarzanclassics"), + #(u"That's Life", u"http://www.gocomics.com/thatslife"), + #(u"The Academia Waltz", u"http://www.gocomics.com/academiawaltz"), + #(u"The Barn", u"http://www.gocomics.com/thebarn"), + #(u"The Boiling Point", u"http://www.gocomics.com/theboilingpoint"), + #(u"The Boondocks", u"http://www.gocomics.com/boondocks"), + (u"The Born Loser", u"http://www.gocomics.com/thebornloser"), + #(u"The Buckets", u"http://www.gocomics.com/thebuckets"), + #(u"The City", u"http://www.gocomics.com/thecity"), + #(u"The Dinette Set", u"http://www.gocomics.com/dinetteset"), + #(u"The Doozies", u"http://www.gocomics.com/thedoozies"), + #(u"The Duplex", u"http://www.gocomics.com/duplex"), + #(u"The Elderberries", u"http://www.gocomics.com/theelderberries"), + #(u"The Flying McCoys", u"http://www.gocomics.com/theflyingmccoys"), + #(u"The Fusco Brothers", u"http://www.gocomics.com/thefuscobrothers"), + #(u"The Grizzwells", u"http://www.gocomics.com/thegrizzwells"), + #(u"The Humble Stumble", u"http://www.gocomics.com/thehumblestumble"), + #(u"The Knight Life", u"http://www.gocomics.com/theknightlife"), + #(u"The Meaning of Lila", u"http://www.gocomics.com/meaningoflila"), + (u"The Middletons", u"http://www.gocomics.com/themiddletons"), + #(u"The Norm", u"http://www.gocomics.com/thenorm"), + #(u"The Other Coast", u"http://www.gocomics.com/theothercoast"), + #(u"The Quigmans", u"http://www.gocomics.com/thequigmans"), + #(u"The Sunshine Club", u"http://www.gocomics.com/thesunshineclub"), + #(u"Tiny Sepuk", u"http://www.gocomics.com/tinysepuk"), + #(u"TOBY", u"http://www.gocomics.com/toby"), + #(u"Tom the Dancing Bug", u"http://www.gocomics.com/tomthedancingbug"), + #(u"Too Much Coffee Man", u"http://www.gocomics.com/toomuchcoffeeman"), + #(u"Unstrange Phenomena", u"http://www.gocomics.com/unstrangephenomena"), + #(u"W.T. 
Duck", u"http://www.gocomics.com/wtduck"), + #(u"Watch Your Head", u"http://www.gocomics.com/watchyourhead"), + #(u"Wee Pals", u"http://www.gocomics.com/weepals"), + #(u"Winnie the Pooh", u"http://www.gocomics.com/winniethepooh"), + (u"Wizard of Id", u"http://www.gocomics.com/wizardofid"), + #(u"Working Daze", u"http://www.gocomics.com/workingdaze"), + #(u"Working It Out", u"http://www.gocomics.com/workingitout"), + #(u"Yenny", u"http://www.gocomics.com/yenny"), + #(u"Zack Hill", u"http://www.gocomics.com/zackhill"), + #(u"Ziggy", u"http://www.gocomics.com/ziggy"), + (u"9 to 5", u"http://www.gocomics.com/9to5"), + (u"Heathcliff", u"http://www.gocomics.com/heathcliff"), + (u"Herman", u"http://www.gocomics.com/herman"), + (u"Loose Parts", u"http://www.gocomics.com/looseparts"), + (u"Marmaduke", u"http://www.gocomics.com/marmaduke"), + (u"Ripley's Believe It or Not!", u"http://www.gocomics.com/ripleysbelieveitornot"), + (u"Rubes", u"http://www.gocomics.com/rubes"), + (u"Speed Bump", u"http://www.gocomics.com/speedbump"), + (u"Strange Brew", u"http://www.gocomics.com/strangebrew"), + (u"The Argyle Sweater", u"http://www.gocomics.com/theargylesweater"), + # + ######## EDITORIAL CARTOONS ##################### + #(u"Adam Zyglis", u"http://www.gocomics.com/adamzyglis"), + #(u"Andy Singer", u"http://www.gocomics.com/andysinger"), + #(u"Ben Sargent",u"http://www.gocomics.com/bensargent"), + #(u"Bill Day", u"http://www.gocomics.com/billday"), + #(u"Bill Schorr", u"http://www.gocomics.com/billschorr"), + #(u"Bob Englehart", u"http://www.gocomics.com/bobenglehart"), + #(u"Bob Gorrell",u"http://www.gocomics.com/bobgorrell"), + #(u"Brian Fairrington", u"http://www.gocomics.com/brianfairrington"), + #(u"Bruce Beattie", u"http://www.gocomics.com/brucebeattie"), + #(u"Cam Cardow", u"http://www.gocomics.com/camcardow"), + #(u"Chan Lowe",u"http://www.gocomics.com/chanlowe"), + #(u"Chip Bok",u"http://www.gocomics.com/chipbok"), + #(u"Chris Britt",u"http://www.gocomics.com/chrisbritt"), + #(u"Chuck Asay",u"http://www.gocomics.com/chuckasay"), + #(u"Clay Bennett",u"http://www.gocomics.com/claybennett"), + #(u"Clay Jones",u"http://www.gocomics.com/clayjones"), + #(u"Dan Wasserman",u"http://www.gocomics.com/danwasserman"), + #(u"Dana Summers",u"http://www.gocomics.com/danasummers"), + #(u"Daryl Cagle", u"http://www.gocomics.com/darylcagle"), + #(u"David Fitzsimmons", u"http://www.gocomics.com/davidfitzsimmons"), + #(u"Dick Locher",u"http://www.gocomics.com/dicklocher"), + #(u"Don Wright",u"http://www.gocomics.com/donwright"), + #(u"Donna Barstow",u"http://www.gocomics.com/donnabarstow"), + #(u"Drew Litton", u"http://www.gocomics.com/drewlitton"), + #(u"Drew Sheneman",u"http://www.gocomics.com/drewsheneman"), + #(u"Ed Stein", u"http://www.gocomics.com/edstein"), + #(u"Eric Allie", u"http://www.gocomics.com/ericallie"), + #(u"Gary Markstein", u"http://www.gocomics.com/garymarkstein"), + #(u"Gary McCoy", u"http://www.gocomics.com/garymccoy"), + #(u"Gary Varvel", u"http://www.gocomics.com/garyvarvel"), + #(u"Glenn McCoy",u"http://www.gocomics.com/glennmccoy"), + #(u"Henry Payne", u"http://www.gocomics.com/henrypayne"), + #(u"Jack Ohman",u"http://www.gocomics.com/jackohman"), + #(u"JD Crowe", u"http://www.gocomics.com/jdcrowe"), + #(u"Jeff Danziger",u"http://www.gocomics.com/jeffdanziger"), + #(u"Jeff Parker", u"http://www.gocomics.com/jeffparker"), + #(u"Jeff Stahler", u"http://www.gocomics.com/jeffstahler"), + #(u"Jerry Holbert", u"http://www.gocomics.com/jerryholbert"), + #(u"Jim 
Morin",u"http://www.gocomics.com/jimmorin"), + #(u"Joel Pett",u"http://www.gocomics.com/joelpett"), + #(u"John Cole", u"http://www.gocomics.com/johncole"), + #(u"John Darkow", u"http://www.gocomics.com/johndarkow"), + #(u"John Deering",u"http://www.gocomics.com/johndeering"), + #(u"John Sherffius", u"http://www.gocomics.com/johnsherffius"), + #(u"Ken Catalino",u"http://www.gocomics.com/kencatalino"), + #(u"Kerry Waghorn",u"http://www.gocomics.com/facesinthenews"), + #(u"Kevin Kallaugher",u"http://www.gocomics.com/kevinkallaugher"), + #(u"Lalo Alcaraz",u"http://www.gocomics.com/laloalcaraz"), + #(u"Larry Wright", u"http://www.gocomics.com/larrywright"), + #(u"Lisa Benson", u"http://www.gocomics.com/lisabenson"), + #(u"Marshall Ramsey", u"http://www.gocomics.com/marshallramsey"), + #(u"Matt Bors", u"http://www.gocomics.com/mattbors"), + #(u"Matt Davies",u"http://www.gocomics.com/mattdavies"), + #(u"Michael Ramirez", u"http://www.gocomics.com/michaelramirez"), + #(u"Mike Keefe", u"http://www.gocomics.com/mikekeefe"), + #(u"Mike Luckovich", u"http://www.gocomics.com/mikeluckovich"), + #(u"MIke Thompson", u"http://www.gocomics.com/mikethompson"), + #(u"Monte Wolverton", u"http://www.gocomics.com/montewolverton"), + #(u"Mr. Fish", u"http://www.gocomics.com/mrfish"), + #(u"Nate Beeler", u"http://www.gocomics.com/natebeeler"), + #(u"Nick Anderson", u"http://www.gocomics.com/nickanderson"), + #(u"Pat Bagley", u"http://www.gocomics.com/patbagley"), + #(u"Pat Oliphant",u"http://www.gocomics.com/patoliphant"), + #(u"Paul Conrad",u"http://www.gocomics.com/paulconrad"), + #(u"Paul Szep", u"http://www.gocomics.com/paulszep"), + #(u"RJ Matson", u"http://www.gocomics.com/rjmatson"), + #(u"Rob Rogers", u"http://www.gocomics.com/robrogers"), + #(u"Robert Ariail", u"http://www.gocomics.com/robertariail"), + #(u"Scott Stantis", u"http://www.gocomics.com/scottstantis"), + #(u"Signe Wilkinson", u"http://www.gocomics.com/signewilkinson"), + #(u"Small World",u"http://www.gocomics.com/smallworld"), + #(u"Steve Benson", u"http://www.gocomics.com/stevebenson"), + #(u"Steve Breen", u"http://www.gocomics.com/stevebreen"), + #(u"Steve Kelley", u"http://www.gocomics.com/stevekelley"), + #(u"Steve Sack", u"http://www.gocomics.com/stevesack"), + #(u"Stuart Carlson",u"http://www.gocomics.com/stuartcarlson"), + #(u"Ted Rall",u"http://www.gocomics.com/tedrall"), + #(u"(Th)ink", u"http://www.gocomics.com/think"), + #(u"Tom Toles",u"http://www.gocomics.com/tomtoles"), + #(u"Tony Auth",u"http://www.gocomics.com/tonyauth"), + #(u"Views of the World",u"http://www.gocomics.com/viewsoftheworld"), + #(u"ViewsAfrica",u"http://www.gocomics.com/viewsafrica"), + #(u"ViewsAmerica",u"http://www.gocomics.com/viewsamerica"), + #(u"ViewsAsia",u"http://www.gocomics.com/viewsasia"), + #(u"ViewsBusiness",u"http://www.gocomics.com/viewsbusiness"), + #(u"ViewsEurope",u"http://www.gocomics.com/viewseurope"), + #(u"ViewsLatinAmerica",u"http://www.gocomics.com/viewslatinamerica"), + #(u"ViewsMidEast",u"http://www.gocomics.com/viewsmideast"), + #(u"Walt Handelsman",u"http://www.gocomics.com/walthandelsman"), + #(u"Wayne Stayskal",u"http://www.gocomics.com/waynestayskal"), + #(u"Wit of the World",u"http://www.gocomics.com/witoftheworld"), + ]: + print 'Working on: ', title articles = self.make_links(url) if articles: feeds.append((title, articles)) return feeds def make_links(self, url): - soup = self.index_to_soup(url) - # print 'soup: ', soup - title = '' + title = 'Temp' current_articles = [] - from datetime import datetime, timedelta - now = 
datetime.now() - dates = [(now-timedelta(days=d)).strftime('%Y/%m/%d') for d in range(self.num_comics_to_get)] - - for page in dates: - page_url = url + '/' + str(page) - print(page_url) - soup = self.index_to_soup(page_url) - if soup: - strip_tag = self.tag_to_string(soup.find('a')) - if strip_tag: - print 'strip_tag: ', strip_tag - title = strip_tag - print 'title: ', title + pages = range(1, self.num_comics_to_get+1) + for page in pages: + page_soup = self.index_to_soup(url) + if page_soup: + try: + strip_title = page_soup.find(name='div', attrs={'class':'top'}).h1.a.string + except: + strip_title = 'Error - no Title found' + try: + date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string + if not date_title: + date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string + except: + date_title = 'Error - no Date found' + title = strip_title + ' - ' + date_title + for i in range(2): + try: + strip_url_date = page_soup.find(name='div', attrs={'class':'top'}).h1.a['href'] + break # success - this is normal exit + except: + strip_url_date = None + continue # try to get strip_url_date again + for i in range(2): + try: + prev_strip_url_date = page_soup.find('a', attrs={'class': 'prev'})['href'] + break # success - this is normal exit + except: + prev_strip_url_date = None + continue # try to get prev_strip_url_date again + if strip_url_date: + page_url = 'http://www.gocomics.com' + strip_url_date + else: + continue + if prev_strip_url_date: + prev_page_url = 'http://www.gocomics.com' + prev_strip_url_date + else: + continue current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''}) + url = prev_page_url current_articles.reverse() return current_articles + def preprocess_html(self, soup): + if soup.title: + title_string = soup.title.string.strip() + _cd = title_string.split(',',1)[1] + comic_date = ' '.join(_cd.split(' ', 4)[0:-1]) + if soup.h1.span: + artist = soup.h1.span.string + soup.h1.span.string.replaceWith(comic_date + artist) + feature_item = soup.find('p',attrs={'class':'feature_item'}) + if feature_item.a: + a_tag = feature_item.a + a_href = a_tag["href"] + img_tag = a_tag.img + img_tag["src"] = a_href + img_tag["width"] = self.comic_size + img_tag["height"] = None + return self.adeify_images(soup) + extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} + img {max-width:100%; min-width:100%;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' + ''' + diff --git a/recipes/kdefamily_pl.recipe b/recipes/kdefamily_pl.recipe index df0aacc2df..0e03ea75aa 100644 --- a/recipes/kdefamily_pl.recipe +++ b/recipes/kdefamily_pl.recipe @@ -12,5 +12,6 @@ class KDEFamilyPl(BasicNewsRecipe): max_articles_per_feed = 100 preprocess_regexps = [(re.compile(r"Podobne wpisy.*", re.IGNORECASE|re.DOTALL), lambda m: '')] no_stylesheets = True + remove_empty_feeds = True use_embedded_content = True feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')] \ No newline at end of file diff --git a/recipes/legeartis.recipe b/recipes/legeartis.recipe index 1b882c26d7..b2e1b0e782 100644 --- a/recipes/legeartis.recipe +++ b/recipes/legeartis.recipe @@ -21,7 +21,7 @@ class LegeArtisRecipe(BasicNewsRecipe): no_stylesheets = True remove_javascript = True - + remove_empty_feeds = True extra_css = ''' img{clear: both;} ''' diff --git a/recipes/lomza.recipe 
b/recipes/lomza.recipe index 2c31271624..ed5c513430 100644 --- a/recipes/lomza.recipe +++ b/recipes/lomza.recipe @@ -8,6 +8,7 @@ class Lomza(BasicNewsRecipe): language = 'pl' oldest_article = 15 no_stylesheets = True + extra_css = '#foto {float: right; max-width: 200px; margin-left: 10px;} #fotogaleria > div {float:left;} .br {clear: both;}' max_articles_per_feed = 100 remove_tags=[dict(name='div', attrs={'class':['bxbanner', 'drukuj', 'wyslijznajomemu']})] keep_only_tags=[dict(name='div', attrs={'class':'wiadomosc'})] diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index 7a6038bd65..e7c6bcb654 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python from calibre.web.feeds.recipes import BasicNewsRecipe - +from calibre.ebooks.BeautifulSoup import Comment class PCLab(BasicNewsRecipe): cover_url = 'http://pclab.pl/img/logo.png' title = u"PC Lab" @@ -52,6 +52,9 @@ class PCLab(BasicNewsRecipe): pager = soup2.find('div', attrs={'class':'next'}) pagetext = soup2.find('div', attrs={'class':'substance'}) pagetext = pagetext.find('div', attrs={'class':'data'}) + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() pos = len(appendtag.contents) appendtag.insert(pos, pagetext) diff --git a/recipes/swiatkindle.recipe b/recipes/swiatkindle.recipe index c589d1b6e1..d5774898aa 100644 --- a/recipes/swiatkindle.recipe +++ b/recipes/swiatkindle.recipe @@ -10,7 +10,7 @@ import re from calibre.web.feeds.news import BasicNewsRecipe class swiatczytnikow(BasicNewsRecipe): - title = u'Swiat Czytnikow' + title = u'Świat Czytników' description = u'Czytniki e-książek w Polsce. Jak wybrać, kupić i korzystać z Amazon Kindle i innych' language = 'pl' __author__ = u'Tomasz D\u0142ugosz' diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 9851d76af4..368470cced 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -531,3 +531,8 @@ numeric_collation = False # number here. The default is ten libraries. many_libraries = 10 +#: Highlight the count of books when using a Virtual Library +# The count of books next to the Virtual Library button is highlighted in +# yellow when using a Virtual Library. By setting this to False, you can turn +# that off. +highlight_virtual_library_book_count = True diff --git a/setup/iso_639/ca.po b/setup/iso_639/ca.po index 959a09a192..8f1d62cfb1 100644 --- a/setup/iso_639/ca.po +++ b/setup/iso_639/ca.po @@ -2024,7 +2024,7 @@ msgstr "Àzeri meridional" #. name for aze msgid "Azerbaijani" -msgstr "Serbi" +msgstr "" #. name for azg msgid "Amuzgo; San Pedro Amuzgos" @@ -7288,7 +7288,7 @@ msgstr "Epie" #. name for epo msgid "Esperanto" -msgstr "Alemany" +msgstr "Esperanto" #. name for era msgid "Eravallan" @@ -21816,7 +21816,7 @@ msgstr "Ramoaaina" #. name for raj msgid "Rajasthani" -msgstr "Marwari" +msgstr "" #. name for rak msgid "Tulu-Bohuai" diff --git a/setup/iso_639/cs.po b/setup/iso_639/cs.po index 26cde97d21..4d088c7467 100644 --- a/setup/iso_639/cs.po +++ b/setup/iso_639/cs.po @@ -13762,7 +13762,7 @@ msgstr "" #. name for lav msgid "Latvian" -msgstr "litevština" +msgstr "" #. name for law msgid "Lauje" diff --git a/setup/iso_639/da.po b/setup/iso_639/da.po index 9f80f6f2fb..0e045d8cc4 100644 --- a/setup/iso_639/da.po +++ b/setup/iso_639/da.po @@ -1429,7 +1429,7 @@ msgstr "" #. name for arg msgid "Aragonese" -msgstr "Færøsk" +msgstr "" #. 
name for arh msgid "Arhuaco" diff --git a/setup/iso_639/de.po b/setup/iso_639/de.po index 1e5c880379..813120efc6 100644 --- a/setup/iso_639/de.po +++ b/setup/iso_639/de.po @@ -319,7 +319,7 @@ msgstr "Adangme" #. name for adb msgid "Adabe" -msgstr "Adangme" +msgstr "Adabe" #. name for add msgid "Dzodinka" @@ -367,7 +367,7 @@ msgstr "Adap" #. name for adq msgid "Adangbe" -msgstr "Adangme" +msgstr "Adangbe" #. name for adr msgid "Adonara" diff --git a/setup/iso_639/eu.po b/setup/iso_639/eu.po index 92b3057e51..d8652d91e0 100644 --- a/setup/iso_639/eu.po +++ b/setup/iso_639/eu.po @@ -2022,7 +2022,7 @@ msgstr "" #. name for aze msgid "Azerbaijani" -msgstr "Turkiera" +msgstr "" #. name for azg msgid "Amuzgo; San Pedro Amuzgos" @@ -13126,7 +13126,7 @@ msgstr "" #. name for kur msgid "Kurdish" -msgstr "Turkiera" +msgstr "" #. name for kus msgid "Kusaal" @@ -16190,7 +16190,7 @@ msgstr "" #. name for mlt msgid "Maltese" -msgstr "Koreera" +msgstr "" #. name for mlu msgid "To'abaita" diff --git a/setup/iso_639/gl.po b/setup/iso_639/gl.po index dca7bcdbd6..29ba6d291e 100644 --- a/setup/iso_639/gl.po +++ b/setup/iso_639/gl.po @@ -13764,7 +13764,7 @@ msgstr "Laba" #. name for lav msgid "Latvian" -msgstr "Lituano" +msgstr "" #. name for law msgid "Lauje" @@ -22212,7 +22212,7 @@ msgstr "Roglai do norte" #. name for roh msgid "Romansh" -msgstr "Romanés" +msgstr "" #. name for rol msgid "Romblomanon" diff --git a/setup/iso_639/hu.po b/setup/iso_639/hu.po index fa912f715f..b6fe5d12ff 100644 --- a/setup/iso_639/hu.po +++ b/setup/iso_639/hu.po @@ -20538,7 +20538,7 @@ msgstr "" #. name for peo msgid "Persian; Old (ca. 600-400 B.C.)" -msgstr "perzsa" +msgstr "" #. name for pep msgid "Kunja" diff --git a/setup/iso_639/is.po b/setup/iso_639/is.po index 1005120ba0..067b15b33b 100644 --- a/setup/iso_639/is.po +++ b/setup/iso_639/is.po @@ -15049,7 +15049,7 @@ msgstr "Magahi" #. name for mah msgid "Marshallese" -msgstr "Maltneska" +msgstr "" #. name for mai msgid "Maithili" diff --git a/setup/iso_639/ko.po b/setup/iso_639/ko.po index e4bccbb8f7..046bd1c1d6 100644 --- a/setup/iso_639/ko.po +++ b/setup/iso_639/ko.po @@ -3742,7 +3742,7 @@ msgstr "" #. name for bre msgid "Breton" -msgstr "프랑스어" +msgstr "" #. name for brf msgid "Bera" diff --git a/setup/iso_639/mr.po b/setup/iso_639/mr.po index beef0a1c61..56441979de 100644 --- a/setup/iso_639/mr.po +++ b/setup/iso_639/mr.po @@ -6804,7 +6804,7 @@ msgstr "डोगोन; तेबुल उरे" #. name for dua msgid "Duala" -msgstr "ड्युला" +msgstr "" #. name for dub msgid "Dubli" diff --git a/setup/iso_639/nb.po b/setup/iso_639/nb.po index 66527ebb47..2d86f9d25f 100644 --- a/setup/iso_639/nb.po +++ b/setup/iso_639/nb.po @@ -27790,7 +27790,7 @@ msgstr "" #. name for wln msgid "Walloon" -msgstr "Vietnamesisk" +msgstr "" #. name for wlo msgid "Wolio" diff --git a/setup/iso_639/oc.po b/setup/iso_639/oc.po index f73ce842b9..03eb65016a 100644 --- a/setup/iso_639/oc.po +++ b/setup/iso_639/oc.po @@ -9862,7 +9862,7 @@ msgstr "Hya" #. name for hye msgid "Armenian" -msgstr "Albanés" +msgstr "" #. name for iai msgid "Iaai" @@ -13762,7 +13762,7 @@ msgstr "Laba" #. name for lav msgid "Latvian" -msgstr "Lituanian" +msgstr "" #. name for law msgid "Lauje" diff --git a/setup/iso_639/ru.po b/setup/iso_639/ru.po index ecc5f756c2..b7de34435f 100644 --- a/setup/iso_639/ru.po +++ b/setup/iso_639/ru.po @@ -2089,7 +2089,7 @@ msgstr "Башкирский" #. name for bal msgid "Baluchi" -msgstr "Балийский" +msgstr "" #. 
name for bam msgid "Bambara" diff --git a/setup/iso_639/sk.po b/setup/iso_639/sk.po index e7e208efc4..f949e89803 100644 --- a/setup/iso_639/sk.po +++ b/setup/iso_639/sk.po @@ -13763,7 +13763,7 @@ msgstr "" #. name for lav msgid "Latvian" -msgstr "Lotyšský" +msgstr "" #. name for law msgid "Lauje" diff --git a/setup/iso_639/zh_CN.po b/setup/iso_639/zh_CN.po index fecc4d798e..cd119adb33 100644 --- a/setup/iso_639/zh_CN.po +++ b/setup/iso_639/zh_CN.po @@ -1016,7 +1016,7 @@ msgstr "" #. name for amh msgid "Amharic" -msgstr "阿拉伯语" +msgstr "" #. name for ami msgid "Amis" diff --git a/setup/translations.py b/setup/translations.py index e0a512d21c..d151e51ebc 100644 --- a/setup/translations.py +++ b/setup/translations.py @@ -18,7 +18,7 @@ def qt_sources(): 'src/gui/widgets/qdialogbuttonbox.cpp', ])) -class POT(Command): # {{{ +class POT(Command): # {{{ description = 'Update the .pot translation template' PATH = os.path.join(Command.SRC, __appname__, 'translations') @@ -63,7 +63,6 @@ class POT(Command): # {{{ return '\n'.join(ans) - def run(self, opts): pot_header = textwrap.dedent('''\ # Translation template file.. @@ -117,11 +116,10 @@ class POT(Command): # {{{ f.write(src) self.info('Translations template:', os.path.abspath(pot)) - return pot # }}} -class Translations(POT): # {{{ +class Translations(POT): # {{{ description='''Compile the translations''' DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization', 'locales') @@ -134,6 +132,7 @@ class Translations(POT): # {{{ return locale, os.path.join(self.DEST, locale, 'messages.mo') def run(self, opts): + self.iso639_errors = [] for f in self.po_files(): locale, dest = self.mo_file(f) base = os.path.dirname(dest) @@ -146,18 +145,46 @@ class Translations(POT): # {{{ '%s.po'%iscpo) if os.path.exists(iso639): + self.check_iso639(iso639) dest = self.j(self.d(dest), 'iso639.mo') if self.newer(dest, iso639): - self.info('\tCopying ISO 639 translations') + self.info('\tCopying ISO 639 translations for %s' % iscpo) subprocess.check_call(['msgfmt', '-o', dest, iso639]) elif locale not in ('en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds', 'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku', 'fr_CA', 'him', 'jv', 'ka', 'fur', 'ber'): self.warn('No ISO 639 translations for locale:', locale) + if self.iso639_errors: + for err in self.iso639_errors: + print (err) + raise SystemExit(1) + self.write_stats() self.freeze_locales() + def check_iso639(self, path): + from calibre.utils.localization import langnames_to_langcodes + with open(path, 'rb') as f: + raw = f.read() + rmap = {} + msgid = None + for match in re.finditer(r'^(msgid|msgstr)\s+"(.*?)"', raw, re.M): + if match.group(1) == 'msgid': + msgid = match.group(2) + else: + msgstr = match.group(2) + if not msgstr: + continue + omsgid = rmap.get(msgstr, None) + if omsgid is not None: + cm = langnames_to_langcodes([omsgid, msgid]) + if cm[msgid] and cm[omsgid] and cm[msgid] != cm[omsgid]: + self.iso639_errors.append('In file %s the name %s is used as translation for both %s and %s' % ( + os.path.basename(path), msgstr, msgid, rmap[msgstr])) + # raise SystemExit(1) + rmap[msgstr] = msgid + def freeze_locales(self): zf = self.DEST + '.zip' from calibre import CurrentDir @@ -191,7 +218,6 @@ class Translations(POT): # {{{ locale = self.mo_file(f)[0] stats[locale] = min(1.0, float(trans)/total) - import cPickle cPickle.dump(stats, open(dest, 'wb'), -1) @@ -211,7 +237,7 @@ class Translations(POT): # {{{ # }}} -class GetTranslations(Translations): # {{{ +class GetTranslations(Translations): # {{{ 
description = 'Get updated translations from Launchpad' BRANCH = 'lp:~kovid/calibre/translations' @@ -286,7 +312,7 @@ class GetTranslations(Translations): # {{{ # }}} -class ISO639(Command): # {{{ +class ISO639(Command): # {{{ description = 'Compile translations for ISO 639 codes' DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization', diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index da90452d0f..289a192b83 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1476,7 +1476,6 @@ class StoreKoobeStore(StoreBase): drm_free_only = True headquarters = 'PL' formats = ['EPUB', 'MOBI', 'PDF'] - affiliate = True class StoreLegimiStore(StoreBase): name = 'Legimi' @@ -1660,7 +1659,6 @@ class StoreWoblinkStore(StoreBase): headquarters = 'PL' formats = ['EPUB', 'MOBI', 'PDF', 'WOBLINK'] - affiliate = True class XinXiiStore(StoreBase): name = 'XinXii' diff --git a/src/calibre/devices/blackberry/driver.py b/src/calibre/devices/blackberry/driver.py index 4a85fa695a..6c3111cb3c 100644 --- a/src/calibre/devices/blackberry/driver.py +++ b/src/calibre/devices/blackberry/driver.py @@ -19,10 +19,10 @@ class BLACKBERRY(USBMS): VENDOR_ID = [0x0fca] PRODUCT_ID = [0x8004, 0x0004] - BCD = [0x0200, 0x0107, 0x0210, 0x0201, 0x0211, 0x0220] + BCD = [0x0200, 0x0107, 0x0210, 0x0201, 0x0211, 0x0220, 0x232] VENDOR_NAME = 'RIM' - WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['BLACKBERRY_SD', 'BLACKBERRY'] MAIN_MEMORY_VOLUME_LABEL = 'Blackberry SD Card' diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py index 046b362ae7..a8e18aad11 100644 --- a/src/calibre/ebooks/conversion/plugins/txt_input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -91,14 +91,15 @@ class TXTInput(InputFormatPlugin): log.debug('Using user specified input encoding of %s' % ienc) else: det_encoding = detect(txt) + det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence'] if det_encoding and det_encoding.lower().replace('_', '-').strip() in ( 'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn', 'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'): # Microsoft Word exports to HTML with encoding incorrectly set to # gb2312 instead of gbk. gbk is a superset of gb2312, anyway. 
det_encoding = 'gbk' - ienc = det_encoding['encoding'] - log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100)) + ienc = det_encoding + log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100)) if not ienc: ienc = 'utf-8' log.debug('No input encoding specified and could not auto detect using %s' % ienc) diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index cae22e086c..ec0decacef 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -105,6 +105,9 @@ class DOCX(object): name = os.path.relpath(f, self.tdir).replace(os.sep, '/') self.names[name] = f + def exists(self, name): + return name in self.names + def read(self, name): if hasattr(self, 'zipf'): return self.zipf.open(name).read() @@ -149,14 +152,39 @@ class DOCX(object): self.relationships_rmap[target] = typ @property - def document(self): + def document_name(self): name = self.relationships.get(DOCUMENT, None) if name is None: names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml')) if not names: raise InvalidDOCX('The file %s docx file has no main document' % self.name) name = names[0] - return fromstring(self.read(name)) + return name + + @property + def document(self): + return fromstring(self.read(self.document_name)) + + @property + def document_relationships(self): + name = self.document_name + base = '/'.join(name.split('/')[:-1]) + by_id, by_type = {}, {} + parts = name.split('/') + name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels']) + try: + raw = self.read(name) + except KeyError: + pass + else: + root = fromstring(raw) + for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): + target = '/'.join((base, item.get('Target').lstrip('/'))) + typ = item.get('Type') + Id = item.get('Id') + by_id[Id] = by_type[typ] = target + + return by_id, by_type @property def metadata(self): diff --git a/src/calibre/ebooks/docx/lcid.py b/src/calibre/ebooks/docx/lcid.py new file mode 100644 index 0000000000..293212ab8b --- /dev/null +++ b/src/calibre/ebooks/docx/lcid.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +lcid = { + 1078: 'af', # Afrikaans - South Africa + 1052: 'sq', # Albanian - Albania + 1118: 'am', # Amharic - Ethiopia + 1025: 'ar', # Arabic - Saudi Arabia + 5121: 'ar', # Arabic - Algeria + 15361: 'ar', # Arabic - Bahrain + 3073: 'ar', # Arabic - Egypt + 2049: 'ar', # Arabic - Iraq + 11265: 'ar', # Arabic - Jordan + 13313: 'ar', # Arabic - Kuwait + 12289: 'ar', # Arabic - Lebanon + 4097: 'ar', # Arabic - Libya + 6145: 'ar', # Arabic - Morocco + 8193: 'ar', # Arabic - Oman + 16385: 'ar', # Arabic - Qatar + 10241: 'ar', # Arabic - Syria + 7169: 'ar', # Arabic - Tunisia + 14337: 'ar', # Arabic - U.A.E. 
+ 9217: 'ar', # Arabic - Yemen + 1067: 'hy', # Armenian - Armenia + 1101: 'as', # Assamese + 2092: 'az', # Azeri (Cyrillic) + 1068: 'az', # Azeri (Latin) + 1069: 'eu', # Basque + 1059: 'be', # Belarusian + 1093: 'bn', # Bengali (India) + 2117: 'bn', # Bengali (Bangladesh) + 5146: 'bs', # Bosnian (Bosnia/Herzegovina) + 1026: 'bg', # Bulgarian + 1109: 'my', # Burmese + 1027: 'ca', # Catalan + 1116: 'chr', # Cherokee - United States + 2052: 'zh', # Chinese - People's Republic of China + 4100: 'zh', # Chinese - Singapore + 1028: 'zh', # Chinese - Taiwan + 3076: 'zh', # Chinese - Hong Kong SAR + 5124: 'zh', # Chinese - Macao SAR + 1050: 'hr', # Croatian + 4122: 'hr', # Croatian (Bosnia/Herzegovina) + 1029: 'cs', # Czech + 1030: 'da', # Danish + 1125: 'dv', # Divehi + 1043: 'nl', # Dutch - Netherlands + 2067: 'nl', # Dutch - Belgium + 1126: 'bin', # Edo + 1033: 'en', # English - United States + 2057: 'en', # English - United Kingdom + 3081: 'en', # English - Australia + 10249: 'en', # English - Belize + 4105: 'en', # English - Canada + 9225: 'en', # English - Caribbean + 15369: 'en', # English - Hong Kong SAR + 16393: 'en', # English - India + 14345: 'en', # English - Indonesia + 6153: 'en', # English - Ireland + 8201: 'en', # English - Jamaica + 17417: 'en', # English - Malaysia + 5129: 'en', # English - New Zealand + 13321: 'en', # English - Philippines + 18441: 'en', # English - Singapore + 7177: 'en', # English - South Africa + 11273: 'en', # English - Trinidad + 12297: 'en', # English - Zimbabwe + 1061: 'et', # Estonian + 1080: 'fo', # Faroese + 1065: None, # TODO: Farsi + 1124: 'fil', # Filipino + 1035: 'fi', # Finnish + 1036: 'fr', # French - France + 2060: 'fr', # French - Belgium + 11276: 'fr', # French - Cameroon + 3084: 'fr', # French - Canada + 9228: 'fr', # French - Democratic Rep. 
of Congo
+    12300: 'fr',  # French - Cote d'Ivoire
+    15372: 'fr',  # French - Haiti
+    5132: 'fr',   # French - Luxembourg
+    13324: 'fr',  # French - Mali
+    6156: 'fr',   # French - Monaco
+    14348: 'fr',  # French - Morocco
+    58380: 'fr',  # French - North Africa
+    8204: 'fr',   # French - Reunion
+    10252: 'fr',  # French - Senegal
+    4108: 'fr',   # French - Switzerland
+    7180: 'fr',   # French - West Indies
+    1122: 'fy',   # Frisian - Netherlands
+    1127: None,   # TODO: Fulfulde - Nigeria
+    1071: 'mk',   # FYRO Macedonian
+    2108: 'ga',   # Gaelic (Ireland)
+    1084: 'gd',   # Gaelic (Scotland)
+    1110: 'gl',   # Galician
+    1079: 'ka',   # Georgian
+    1031: 'de',   # German - Germany
+    3079: 'de',   # German - Austria
+    5127: 'de',   # German - Liechtenstein
+    4103: 'de',   # German - Luxembourg
+    2055: 'de',   # German - Switzerland
+    1032: 'el',   # Greek
+    1140: 'gn',   # Guarani - Paraguay
+    1095: 'gu',   # Gujarati
+    1128: 'ha',   # Hausa - Nigeria
+    1141: 'haw',  # Hawaiian - United States
+    1037: 'he',   # Hebrew
+    1081: 'hi',   # Hindi
+    1038: 'hu',   # Hungarian
+    1129: None,   # TODO: Ibibio - Nigeria
+    1039: 'is',   # Icelandic
+    1136: 'ig',   # Igbo - Nigeria
+    1057: 'id',   # Indonesian
+    1117: 'iu',   # Inuktitut
+    1040: 'it',   # Italian - Italy
+    2064: 'it',   # Italian - Switzerland
+    1041: 'ja',   # Japanese
+    1099: 'kn',   # Kannada
+    1137: 'kr',   # Kanuri - Nigeria
+    2144: 'ks',   # Kashmiri
+    1120: 'ks',   # Kashmiri (Arabic)
+    1087: 'kk',   # Kazakh
+    1107: 'km',   # Khmer
+    1111: 'kok',  # Konkani
+    1042: 'ko',   # Korean
+    1088: 'ky',   # Kyrgyz (Cyrillic)
+    1108: 'lo',   # Lao
+    1142: 'la',   # Latin
+    1062: 'lv',   # Latvian
+    1063: 'lt',   # Lithuanian
+    1086: 'ms',   # Malay - Malaysia
+    2110: 'ms',   # Malay - Brunei Darussalam
+    1100: 'ml',   # Malayalam
+    1082: 'mt',   # Maltese
+    1112: 'mni',  # Manipuri
+    1153: 'mi',   # Maori - New Zealand
+    1102: 'mr',   # Marathi
+    1104: 'mn',   # Mongolian (Cyrillic)
+    2128: 'mn',   # Mongolian (Mongolian)
+    1121: 'ne',   # Nepali
+    2145: 'ne',   # Nepali - India
+    1044: 'no',   # Norwegian (Bokmål)
+    2068: 'no',   # Norwegian (Nynorsk)
+    1096: 'or',   # Oriya
+    1138: 'om',   # Oromo
+    1145: 'pap',  # Papiamentu
+    1123: 'ps',   # Pashto
+    1045: 'pl',   # Polish
+    1046: 'pt',   # Portuguese - Brazil
+    2070: 'pt',   # Portuguese - Portugal
+    1094: 'pa',   # Punjabi
+    2118: 'pa',   # Punjabi (Pakistan)
+    1131: 'qu',   # Quecha - Bolivia
+    2155: 'qu',   # Quecha - Ecuador
+    3179: 'qu',   # Quecha - Peru
+    1047: 'rm',   # Rhaeto-Romanic
+    1048: 'ro',   # Romanian
+    2072: 'ro',   # Romanian - Moldava
+    1049: 'ru',   # Russian
+    2073: 'ru',   # Russian - Moldava
+    1083: 'se',   # Sami (Lappish)
+    1103: 'sa',   # Sanskrit
+    1132: 'nso',  # Sepedi
+    3098: 'sr',   # Serbian (Cyrillic)
+    2074: 'sr',   # Serbian (Latin)
+    1113: 'sd',   # Sindhi - India
+    2137: 'sd',   # Sindhi - Pakistan
+    1115: 'si',   # Sinhalese - Sri Lanka
+    1051: 'sk',   # Slovak
+    1060: 'sl',   # Slovenian
+    1143: 'so',   # Somali
+    1070: 'wen',  # Sorbian
+    3082: 'es',   # Spanish - Spain (Modern Sort)
+    1034: 'es',   # Spanish - Spain (Traditional Sort)
+    11274: 'es',  # Spanish - Argentina
+    16394: 'es',  # Spanish - Bolivia
+    13322: 'es',  # Spanish - Chile
+    9226: 'es',   # Spanish - Colombia
+    5130: 'es',   # Spanish - Costa Rica
+    7178: 'es',   # Spanish - Dominican Republic
+    12298: 'es',  # Spanish - Ecuador
+    17418: 'es',  # Spanish - El Salvador
+    4106: 'es',   # Spanish - Guatemala
+    18442: 'es',  # Spanish - Honduras
+    58378: 'es',  # Spanish - Latin America
+    2058: 'es',   # Spanish - Mexico
+    19466: 'es',  # Spanish - Nicaragua
+    6154: 'es',   # Spanish - Panama
+    15370: 'es',  # Spanish - Paraguay
+    10250: 'es',  # Spanish - Peru
+    20490: 'es',  # Spanish - Puerto Rico
+    21514: 'es',  # Spanish - United States
+    14346: 'es',  # Spanish - Uruguay
+    8202: 'es',   # Spanish - Venezuela
+    1072: None,   # TODO: Sutu
+    1089: 'sw',   # Swahili
+    1053: 'sv',   # Swedish
+    2077: 'sv',   # Swedish - Finland
+    1114: 'syr',  # Syriac
+    1064: 'tg',   # Tajik
+    1119: None,   # TODO: Tamazight (Arabic)
+    2143: None,   # TODO: Tamazight (Latin)
+    1097: 'ta',   # Tamil
+    1092: 'tt',   # Tatar
+    1098: 'te',   # Telugu
+    1054: 'th',   # Thai
+    2129: 'bo',   # Tibetan - Bhutan
+    1105: 'bo',   # Tibetan - People's Republic of China
+    2163: 'ti',   # Tigrigna - Eritrea
+    1139: 'ti',   # Tigrigna - Ethiopia
+    1073: 'ts',   # Tsonga
+    1074: 'tn',   # Tswana
+    1055: 'tr',   # Turkish
+    1090: 'tk',   # Turkmen
+    1152: 'ug',   # Uighur - China
+    1058: 'uk',   # Ukrainian
+    1056: 'ur',   # Urdu
+    2080: 'ur',   # Urdu - India
+    2115: 'uz',   # Uzbek (Cyrillic)
+    1091: 'uz',   # Uzbek (Latin)
+    1075: 've',   # Venda
+    1066: 'vi',   # Vietnamese
+    1106: 'cy',   # Welsh
+    1076: 'xh',   # Xhosa
+    1144: 'ii',   # Yi
+    1085: 'yi',   # Yiddish
+    1130: 'yo',   # Yoruba
+    1077: 'zu'    # Zulu
+}
diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index 9080377b36..2b5dcca653 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -11,6 +11,7 @@ from lxml.etree import XPath as X
 
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
 DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
 APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
+STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
 
 namespaces = {
@@ -20,6 +21,7 @@ namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
     'o': 'urn:schemas-microsoft-com:office:office',
     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
     'w10': 'urn:schemas-microsoft-com:office:word',
     'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+    'xml': 'http://www.w3.org/XML/1998/namespace',
     # Drawing
     'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
     'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
@@ -45,3 +47,18 @@ def XPath(expr):
     return X(expr, namespaces=namespaces)
 
+def is_tag(x, q):
+    tag = getattr(x, 'tag', x)
+    ns, name = q.partition(':')[0::2]
+    return '{%s}%s' % (namespaces.get(ns, None), name) == tag
+
+def barename(x):
+    return x.rpartition('}')[-1]
+
+def XML(x):
+    return '{%s}%s' % (namespaces['xml'], x)
+
+def get(x, attr, default=None):
+    ns, name = attr.partition(':')[0::2]
+    return x.attrib.get('{%s}%s' % (namespaces[ns], name), default)
+
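The helpers added to names.py above do all the namespace bookkeeping for the DOCX code: is_tag() compares Clark-notation tags, get() reads namespaced attributes, and XML() builds names in the xml: namespace. A minimal standalone sketch of how they behave on a tiny WordprocessingML fragment (not part of the patch; the namespace table is trimmed to the two entries the example needs):

# Standalone sketch, not part of the patch: names.py helpers in action.
from lxml import etree

namespaces = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'xml': 'http://www.w3.org/XML/1998/namespace',
}

def is_tag(x, q):
    tag = getattr(x, 'tag', x)
    ns, name = q.partition(':')[0::2]
    return '{%s}%s' % (namespaces.get(ns, None), name) == tag

def get(x, attr, default=None):
    ns, name = attr.partition(':')[0::2]
    return x.attrib.get('{%s}%s' % (namespaces[ns], name), default)

raw = ('<w:r xmlns:w="%s"><w:t xml:space="preserve">Hello </w:t></w:r>'
       % namespaces['w'])
run = etree.fromstring(raw)
t = run[0]
print is_tag(t, 'w:t')            # True: the tag is {w-namespace}t
print get(t, 'xml:space')         # 'preserve'
print get(t, 'w:val', 'missing')  # 'missing': w:val is not set on w:t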
diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py
new file mode 100644
index 0000000000..f88b09bd26
--- /dev/null
+++ b/src/calibre/ebooks/docx/styles.py
@@ -0,0 +1,430 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+
+from calibre.ebooks.docx.names import XPath, get
+
+class Inherit:
+    pass
+inherit = Inherit()
+
+def binary_property(parent, name):
+    vals = XPath('./w:%s' % name)(parent)
+    if not vals:
+        return inherit
+    val = get(vals[0], 'w:val', 'on')
+    return True if val in {'on', '1', 'true'} else False
+
+def simple_color(col, auto='black'):
+    if not col or col == 'auto' or len(col) != 6:
+        return auto
+    return '#'+col
+
+def simple_float(val, mult=1.0):
+    try:
+        return float(val) * mult
+    except (ValueError, TypeError, AttributeError, KeyError):
+        return None
+
+# Block styles {{{
+
+LINE_STYLES = {  # {{{
+    'basicBlackDashes': 'dashed',
+    'basicBlackDots': 'dotted',
+    'basicBlackSquares': 'dashed',
+    'basicThinLines': 'solid',
+    'dashDotStroked': 'groove',
+    'dashed': 'dashed',
+    'dashSmallGap': 'dashed',
+    'dotDash': 'dashed',
+    'dotDotDash': 'dashed',
+    'dotted': 'dotted',
+    'double': 'double',
+    'inset': 'inset',
+    'nil': 'none',
+    'none': 'none',
+    'outset': 'outset',
+    'single': 'solid',
+    'thick': 'solid',
+    'thickThinLargeGap': 'double',
+    'thickThinMediumGap': 'double',
+    'thickThinSmallGap' : 'double',
+    'thinThickLargeGap': 'double',
+    'thinThickMediumGap': 'double',
+    'thinThickSmallGap': 'double',
+    'thinThickThinLargeGap': 'double',
+    'thinThickThinMediumGap': 'double',
+    'thinThickThinSmallGap': 'double',
+    'threeDEmboss': 'ridge',
+    'threeDEngrave': 'groove',
+    'triple': 'double',
+}  # }}}
+
+def read_border(parent, dest):
+    tvals = {'padding_%s':inherit, 'border_%s_width':inherit,
+        'border_%s_style':inherit, 'border_%s_color':inherit}
+    vals = {}
+    for edge in ('left', 'top', 'right', 'bottom'):
+        vals.update({k % edge:v for k, v in tvals.iteritems()})
+
+    for border in XPath('./w:pBdr')(parent):
+        for edge in ('left', 'top', 'right', 'bottom'):
+            for elem in XPath('./w:%s' % edge)(border):
+                color = get(elem, 'w:color')
+                if color is not None:
+                    vals['border_%s_color' % edge] = simple_color(color)
+                style = get(elem, 'w:val')
+                if style is not None:
+                    vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid')
+                space = get(elem, 'w:space')
+                if space is not None:
+                    try:
+                        vals['padding_%s' % edge] = float(space)
+                    except (ValueError, TypeError):
+                        pass
+                sz = get(elem, 'w:sz')
+                if sz is not None:
+                    # we don't care about art borders (they are only used for page borders)
+                    try:
+                        vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8
+                    except (ValueError, TypeError):
+                        pass
+
+    for key, val in vals.iteritems():
+        setattr(dest, key, val)
+
+def read_indent(parent, dest):
+    padding_left = padding_right = text_indent = inherit
+    for indent in XPath('./w:ind')(parent):
+        l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
+        pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
+        if pl is not None:
+            padding_left = '%.3f%s' % (pl, 'em' if lc is not None else 'pt')
+
+        r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
+        pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
+        if pr is not None:
+            padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt')
+
+        h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
+        fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
+        ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
+              simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
+        if ti is not None:
+            text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')
+
+    setattr(dest, 'margin_left', padding_left)
+    setattr(dest, 'margin_right', padding_right)
+    setattr(dest, 'text_indent', text_indent)
+
+def read_justification(parent, dest):
+    ans = inherit
+    for jc in XPath('./w:jc[@w:val]')(parent):
+        val = get(jc, 'w:val')
+        if not val:
+            continue
+        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
+            ans = 'justify'
+        if val in 
{'left', 'center', 'right',}: + ans = val + setattr(dest, 'text_align', ans) + +def read_spacing(parent, dest): + padding_top = padding_bottom = line_height = inherit + for s in XPath('./w:spacing')(parent): + a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing') + pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None + if pb is not None: + padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt') + + b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing') + pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None + if pt is not None: + padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 'pt') + + l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto') + if l is not None: + lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0) + line_height = '%.3f%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '') + + setattr(dest, 'margin_top', padding_top) + setattr(dest, 'margin_bottom', padding_bottom) + setattr(dest, 'line_height', line_height) + +def read_direction(parent, dest): + ans = inherit + for jc in XPath('./w:textFlow[@w:val]')(parent): + val = get(jc, 'w:val') + if not val: + continue + if 'rl' in val.lower(): + ans = 'rtl' + setattr(dest, 'direction', ans) + +def read_shd(parent, dest): + ans = inherit + for shd in XPath('./w:shd[@w:fill]')(parent): + val = get(shd, 'w:fill') + if val: + ans = simple_color(val, auto='transparent') + setattr(dest, 'background_color', ans) + +class ParagraphStyle(object): + + all_properties = ( + 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', + 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', + 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', + 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', + + # Border margins padding + 'border_left_width', 'border_left_style', 'border_left_color', 'padding_left', + 'border_top_width', 'border_top_style', 'border_top_color', 'padding_top', + 'border_right_width', 'border_right_style', 'border_right_color', 'padding_right', + 'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom', + 'margin_left', 'margin_top', 'margin_right', 'margin_bottom', + + # Misc. 
+ 'text_indent', 'text_align', 'line_height', 'direction', 'background_color', + ) + + def __init__(self, pPr): + for p in ( + 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', + 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', + 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', + 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', + ): + setattr(self, p, binary_property(pPr, p)) + + for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'): + f = globals()['read_%s' % x] + f(pPr, self) + + # TODO: numPr and outlineLvl + + def update(self, other): + for prop in self.all_properties: + nval = getattr(other, prop) + if nval is not inherit: + setattr(self, prop, nval) + +# }}} + +# Character styles {{{ +def read_text_border(parent, dest): + border_color = border_style = border_width = padding = inherit + elems = XPath('./w:bdr')(parent) + if elems: + border_color = simple_color('auto') + border_style = 'solid' + border_width = 1 + for elem in elems: + color = get(elem, 'w:color') + if color is not None: + border_color = simple_color(color) + style = get(elem, 'w:val') + if style is not None: + border_style = LINE_STYLES.get(style, 'solid') + space = get(elem, 'w:space') + if space is not None: + try: + padding = float(space) + except (ValueError, TypeError): + pass + sz = get(elem, 'w:sz') + if sz is not None: + # we dont care about art borders (they are only used for page borders) + try: + border_width = min(96, max(2, float(sz))) / 8 + except (ValueError, TypeError): + pass + + setattr(dest, 'border_color', border_color) + setattr(dest, 'border_style', border_style) + setattr(dest, 'border_width', border_width) + setattr(dest, 'padding', padding) + +def read_color(parent, dest): + ans = inherit + for col in XPath('./w:color[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + ans = simple_color(val) + setattr(dest, 'color', ans) + +def read_highlight(parent, dest): + ans = inherit + for col in XPath('./w:highlight[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + if not val or val == 'none': + val = 'transparent' + ans = val + setattr(dest, 'highlight', ans) + +def read_lang(parent, dest): + ans = inherit + for col in XPath('./w:lang[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + try: + code = int(val, 16) + except (ValueError, TypeError): + ans = val + else: + from calibre.ebooks.docx.lcid import lcid + val = lcid.get(code, None) + if val: + ans = val + setattr(dest, 'lang', ans) + +def read_letter_spacing(parent, dest): + ans = inherit + for col in XPath('./w:spacing[@w:val]')(parent): + val = simple_float(get(col, 'w:val'), 0.05) + if val: + ans = val + setattr(dest, 'letter_spacing', ans) + +def read_sz(parent, dest): + ans = inherit + for col in XPath('./w:sz[@w:val]')(parent): + val = simple_float(get(col, 'w:val'), 0.5) + if val: + ans = val + setattr(dest, 'font_size', ans) + +def read_underline(parent, dest): + ans = inherit + for col in XPath('./w:u[@w:val]')(parent): + val = get(col, 'w:val') + if val: + ans = 'underline' + setattr(dest, 'text_decoration', ans) + +def read_vert_align(parent, dest): + ans = inherit + for col in XPath('./w:vertAlign[@w:val]')(parent): + val = get(col, 'w:val') + if val and val in {'baseline', 'subscript', 'superscript'}: + ans = val + setattr(dest, 'vert_align', ans) + + +class RunStyle(object): + + all_properties = ( + 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', + 'smallCaps', 
'strike', 'vanish',
+
+        'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
+        'letter_spacing', 'font_size', 'text_decoration', 'vert_align',
+    )
+
+    def __init__(self, rPr):
+        for p in (
+            'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
+            'smallCaps', 'strike', 'vanish',
+        ):
+            setattr(self, p, binary_property(rPr, p))
+
+        for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align'):
+            f = globals()['read_%s' % x]
+            f(rPr, self)
+
+    def update(self, other):
+        for prop in self.all_properties:
+            nval = getattr(other, prop)
+            if nval is not inherit:
+                setattr(self, prop, nval)
+# }}}
+
+class Style(object):
+
+    name_path = XPath('./w:name[@w:val]')
+    based_on_path = XPath('./w:basedOn[@w:val]')
+    link_path = XPath('./w:link[@w:val]')
+
+    def __init__(self, elem):
+        self.style_id = get(elem, 'w:styleId')
+        self.style_type = get(elem, 'w:type')
+        names = self.name_path(elem)
+        self.name = get(names[-1], 'w:val') if names else None
+        based_on = self.based_on_path(elem)
+        self.based_on = get(based_on[0], 'w:val') if based_on else None
+        if self.style_type == 'numbering':
+            self.based_on = None
+        link = self.link_path(elem)
+        self.link = get(link[0], 'w:val') if link else None
+        if self.style_type not in {'paragraph', 'character'}:
+            self.link = None
+
+        self.paragraph_style = self.character_style = None
+
+        if self.style_type in {'paragraph', 'character'}:
+            if self.style_type == 'paragraph':
+                for pPr in XPath('./w:pPr')(elem):
+                    ps = ParagraphStyle(pPr)
+                    if self.paragraph_style is None:
+                        self.paragraph_style = ps
+                    else:
+                        self.paragraph_style.update(ps)
+
+            for rPr in XPath('./w:rPr')(elem):
+                rs = RunStyle(rPr)
+                if self.character_style is None:
+                    self.character_style = rs
+                else:
+                    self.character_style.update(rs)
+
+
+class Styles(object):
+
+    def __init__(self):
+        self.id_map = OrderedDict()
+
+    def __iter__(self):
+        for s in self.id_map.itervalues():
+            yield s
+
+    def __getitem__(self, key):
+        return self.id_map[key]
+
+    def __len__(self):
+        return len(self.id_map)
+
+    def get(self, key, default=None):
+        return self.id_map.get(key, default)
+
+    def __call__(self, root):
+        for s in XPath('//w:style')(root):
+            s = Style(s)
+            if s.style_id:
+                self.id_map[s.style_id] = s
+
+        # Nuke based_on, link attributes that refer to missing/incompatible
+        # styles
+        for s in self:
+            bo = s.based_on
+            if bo is not None:
+                p = self.get(bo)
+                if p is None or p.style_type != s.style_type:
+                    s.based_on = None
+            link = s.link
+            if link is not None:
+                p = self.get(link)
+                if p is None or (s.style_type, p.style_type) not in {('paragraph', 'character'), ('character', 'paragraph')}:
+                    s.link = None
+
+    # TODO: Document defaults (docDefaults)
+
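The styles.py hunk above stops at parsing and linking: document defaults are left as a TODO, and nothing yet walks a style's based_on chain to compute effective values. For illustration only, a resolver on top of the Styles collection could look like this (hypothetical helper, not part of the patch; it relies only on the inherit sentinel, paragraph_style and based_on defined above):

# Hypothetical helper, not in this patch: walk the based_on chain of a
# paragraph style until some ancestor specifies the requested property.
def effective_paragraph_value(styles, style_id, prop):
    seen = set()  # guard against based_on cycles in malformed documents
    s = styles.get(style_id)
    while s is not None and s.style_id not in seen:
        seen.add(s.style_id)
        ps = s.paragraph_style
        if ps is not None:
            val = getattr(ps, prop)
            if val is not inherit:
                return val
        s = styles.get(s.based_on)
    return inherit  # falls through to document defaults, still a TODO above

# e.g. effective_paragraph_value(styles, 'Heading1', 'text_align')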
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index b2a5de4691..f0e2c6385d 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -9,33 +9,137 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys, os
 
 from lxml import html
-from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
+from lxml.html.builder import (
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR)
 
-from calibre.ebooks.docx.container import Container
+from calibre.ebooks.docx.container import DOCX, fromstring
+from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES, get
+from calibre.ebooks.docx.styles import Styles
+from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
+
+class Text:
+
+    def __init__(self, elem, attr, buf):
+        self.elem, self.attr, self.buf = elem, attr, buf
+
+    def add_elem(self, elem):
+        setattr(self.elem, self.attr, ''.join(self.buf))
+        self.elem, self.attr, self.buf = elem, 'tail', []
 
 class Convert(object):
 
     def __init__(self, path_or_stream, dest_dir=None, log=None):
-        self.container = Container(path_or_stream, log=log)
-        self.log = self.container.log
+        self.docx = DOCX(path_or_stream, log=log)
+        self.log = self.docx.log
         self.dest_dir = dest_dir or os.getcwdu()
+        self.mi = self.docx.metadata
         self.body = BODY()
+        self.styles = Styles()
         self.html = HTML(
             HEAD(
                 META(charset='utf-8'),
-                TITLE('TODO: read from metadata'),
+                TITLE(self.mi.title or _('Unknown')),
                 LINK(rel='stylesheet', type='text/css', href='docx.css'),
             ),
             self.body
         )
+        self.html.text='\n\t'
+        self.html[0].text='\n\t\t'
+        self.html[0].tail='\n'
+        for child in self.html[0]:
+            child.tail = '\n\t\t'
+        self.html[0][-1].tail = '\n\t'
+        self.html[1].text = self.html[1].tail = '\n'
+        lang = canonicalize_lang(self.mi.language)
+        if lang and lang != 'und':
+            lang = lang_as_iso639_1(lang)
+            if lang:
+                self.html.set('lang', lang)
 
     def __call__(self):
+        doc = self.docx.document
+        relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.read_styles(relationships_by_type)
+        for top_level in XPath('/w:document/w:body/*')(doc):
+            if is_tag(top_level, 'w:p'):
+                p = self.convert_p(top_level)
+                self.body.append(p)
+            elif is_tag(top_level, 'w:tbl'):
+                pass  # TODO: tables
+            elif is_tag(top_level, 'w:sectPr'):
+                pass  # TODO: Last section properties
+            else:
+                self.log.debug('Unknown top-level tag: %s, ignoring' % barename(top_level.tag))
+        if len(self.body) > 0:
+            self.body.text = '\n\t'
+            for child in self.body:
+                child.tail = '\n\t'
+            self.body[-1].tail = '\n'
         self.write()
 
+    def read_styles(self, relationships_by_type):
+        sname = relationships_by_type.get(STYLES, None)
+        if sname is None:
+            name = self.docx.document_name.split('/')
+            name[-1] = 'styles.xml'
+            name = '/'.join(name)
+            if self.docx.exists(name):
+                sname = name
+        if sname is not None:
+            try:
+                raw = self.docx.read(sname)
+            except KeyError:
+                self.log.warn('Styles %s do not exist' % sname)
+            else:
+                self.styles(fromstring(raw))
+
     def write(self):
         raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
         with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
             f.write(raw)
 
+    def convert_p(self, p):
+        dest = P()
+        for run in XPath('descendant::w:r')(p):
+            span = self.convert_run(run)
+            dest.append(span)
+
+        return dest
+
+    def convert_run(self, run):
+        ans = SPAN()
+        text = Text(ans, 'text', [])
+
+        for child in run:
+            if is_tag(child, 'w:t'):
+                if not child.text:
+                    continue
+                space = child.get(XML('space'), None)
+                if space == 'preserve':
+                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
+                    ans.append(text.elem)
+                else:
+                    text.buf.append(child.text)
+            elif is_tag(child, 'w:cr'):
+                text.add_elem(BR())
+                ans.append(text.elem)
+            elif is_tag(child, 'w:br'):
+                typ = get(child, 'w:type')
+                if typ in {'column', 'page'}:
+                    br = BR(style='page-break-after:always')
+                else:
+                    clear = get(child, 'w:clear')
+                    if clear in {'all', 'left', 'right'}:
+                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
+                    else:
+                        br = BR()
+                text.add_elem(br)
+                ans.append(text.elem)
+        if text.buf:
+            setattr(text.elem, text.attr, ''.join(text.buf))
+        return ans
+
 if __name__ == '__main__':
-    Convert(sys.argv[-1])()
+    from calibre.utils.logging import default_log
+    default_log.filter_level = default_log.DEBUG
+    Convert(sys.argv[-1], log=default_log)()
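The small Text class above deals with lxml's text model: character data lives either in an element's .text or in the .tail of the preceding child, so convert_run() buffers string fragments and flushes them whenever a new child element is appended. A standalone demonstration of the same buffering pattern (not part of the patch):

# Standalone demonstration of the Text buffering pattern: data before
# the first child goes into span.text, data after a <br/> into br.tail.
from lxml import html
from lxml.html.builder import SPAN, BR

class Text:

    def __init__(self, elem, attr, buf):
        self.elem, self.attr, self.buf = elem, attr, buf

    def add_elem(self, elem):
        setattr(self.elem, self.attr, ''.join(self.buf))
        self.elem, self.attr, self.buf = elem, 'tail', []

span = SPAN()
text = Text(span, 'text', [])
text.buf.append('first line')
br = BR()
text.add_elem(br)   # flushes 'first line' into span.text
span.append(br)
text.buf.append('second line')
setattr(text.elem, text.attr, ''.join(text.buf))  # 'second line' -> br.tail
print html.tostring(span)  # <span>first line<br>second line</span>

diff 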
--git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 3a532ec2c0..5fcde65ff5 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -187,7 +187,7 @@ def _config(): # {{{ c.add_opt('shortcuts_search_history', default=[], help='Search history for the keyboard preferences') c.add_opt('jobs_search_history', default=[], - help='Search history for the keyboard preferences') + help='Search history for the tweaks preferences') c.add_opt('tweaks_search_history', default=[], help='Search history for tweaks') c.add_opt('worker_limit', default=6, diff --git a/src/calibre/gui2/actions/choose_library.py b/src/calibre/gui2/actions/choose_library.py index d38c16ddf3..0d096d66bd 100644 --- a/src/calibre/gui2/actions/choose_library.py +++ b/src/calibre/gui2/actions/choose_library.py @@ -116,11 +116,12 @@ class MovedDialog(QDialog): # {{{ self.cd.setIcon(QIcon(I('document_open.png'))) self.cd.clicked.connect(self.choose_dir) l.addWidget(self.cd, 2, 1, 1, 1) - self.bb = QDialogButtonBox(self) + self.bb = QDialogButtonBox(QDialogButtonBox.Abort) b = self.bb.addButton(_('Library moved'), self.bb.AcceptRole) b.setIcon(QIcon(I('ok.png'))) b = self.bb.addButton(_('Forget library'), self.bb.RejectRole) b.setIcon(QIcon(I('edit-clear.png'))) + b.clicked.connect(self.forget_library) self.bb.accepted.connect(self.accept) self.bb.rejected.connect(self.reject) l.addWidget(self.bb, 3, 0, 1, ncols) @@ -132,9 +133,8 @@ class MovedDialog(QDialog): # {{{ if d is not None: self.loc.setText(d) - def reject(self): + def forget_library(self): self.stats.remove(self.location) - QDialog.reject(self) def accept(self): newloc = unicode(self.loc.text()) diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py index c6965aaa6c..7b9fb21340 100644 --- a/src/calibre/gui2/search_restriction_mixin.py +++ b/src/calibre/gui2/search_restriction_mixin.py @@ -12,6 +12,7 @@ from PyQt4.Qt import ( from calibre.gui2 import error_dialog, question_dialog from calibre.gui2.widgets import ComboBoxWithHelp +from calibre.utils.config_base import tweaks from calibre.utils.icu import sort_key from calibre.utils.search_query_parser import ParseException from calibre.utils.search_query_parser import saved_searches @@ -576,8 +577,9 @@ class SearchRestrictionMixin(object): rows = self.current_view().row_count() rbc = max(rows, db.data.get_search_restriction_book_count()) t = _("({0} of {1})").format(rows, rbc) - self.search_count.setStyleSheet( - 'QLabel { border-radius: 8px; background-color: yellow; }') + if tweaks['highlight_virtual_library_book_count']: + self.search_count.setStyleSheet( + 'QLabel { border-radius: 8px; background-color: yellow; }') else: # No restriction or not library view if not self.search.in_a_search(): t = _("(all books)") diff --git a/src/calibre/gui2/store/stores/koobe_plugin.py b/src/calibre/gui2/store/stores/koobe_plugin.py index 208592a827..b6ecdea6be 100644 --- a/src/calibre/gui2/store/stores/koobe_plugin.py +++ b/src/calibre/gui2/store/stores/koobe_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2013, Tomasz Długosz ' @@ -25,19 +25,21 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog class KoobeStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): - aff_root = 
'https://www.a4b-tracking.com/pl/stat-click-text-link/15/58/' + #aff_root = 'https://www.a4b-tracking.com/pl/stat-click-text-link/15/58/' url = 'http://www.koobe.pl/' - aff_url = aff_root + str(b64encode(url)) + #aff_url = aff_root + str(b64encode(url)) detail_url = None if detail_item: - detail_url = aff_root + str(b64encode(detail_item)) + detail_url = detail_item #aff_root + str(b64encode(detail_item)) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + #open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) else: - d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) + #d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) + d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else url) d.setWindowTitle(self.name) d.set_tags(self.config.get('tags', '')) d.exec_() diff --git a/src/calibre/gui2/store/stores/woblink_plugin.py b/src/calibre/gui2/store/stores/woblink_plugin.py index 596bb76199..6434488b21 100644 --- a/src/calibre/gui2/store/stores/woblink_plugin.py +++ b/src/calibre/gui2/store/stores/woblink_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 2 # Needed for dynamic plugin loading +store_version = 3 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011-2013, Tomasz Długosz ' @@ -26,19 +26,21 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog class WoblinkStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): - aff_root = 'https://www.a4b-tracking.com/pl/stat-click-text-link/16/58/' + #aff_root = 'https://www.a4b-tracking.com/pl/stat-click-text-link/16/58/' url = 'http://woblink.com/publication' - aff_url = aff_root + str(b64encode(url)) + #aff_url = aff_root + str(b64encode(url)) detail_url = None if detail_item: - detail_url = aff_root + str(b64encode('http://woblink.com' + detail_item)) + detail_url = 'http://woblink.com' + detail_item #aff_root + str(b64encode('http://woblink.com' + detail_item)) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + #open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) else: - d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) + #d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) + d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else url) d.setWindowTitle(self.name) d.set_tags(self.config.get('tags', '')) d.exec_() diff --git a/src/calibre/gui2/viewer/config.py b/src/calibre/gui2/viewer/config.py index a744a2c1f9..4132149842 100644 --- a/src/calibre/gui2/viewer/config.py +++ b/src/calibre/gui2/viewer/config.py @@ -139,12 +139,20 @@ class ConfigDialog(QDialog, Ui_Dialog): self.load_options(opts) self.init_load_themes() + self.clear_search_history_button.clicked.connect(self.clear_search_history) + + def clear_search_history(self): + from calibre.gui2 import config + config['viewer_search_history'] = [] + def save_theme(self): themename, ok = QInputDialog.getText(self, _('Theme name'), _('Choose a name for this theme')) - if not ok: return + if not ok: + return themename = 
unicode(themename).strip() - if not themename: return + if not themename: + return c = config('') c.add_opt('theme_name_xxx', default=themename) self.save_options(c) @@ -247,7 +255,8 @@ class ConfigDialog(QDialog, Ui_Dialog): def update_sample_colors(self): for x in ('text', 'background'): val = getattr(self, 'current_%s_color'%x) - if not val: val = 'inherit' if x == 'text' else 'transparent' + if not val: + val = 'inherit' if x == 'text' else 'transparent' ss = 'QLabel { %s: %s }'%('background-color' if x == 'background' else 'color', val) getattr(self, '%s_color_sample'%x).setStyleSheet(ss) diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui index b146b571a2..1dce8e1a9e 100644 --- a/src/calibre/gui2/viewer/config.ui +++ b/src/calibre/gui2/viewer/config.ui @@ -68,7 +68,7 @@ QToolBox::tab:hover { 0 0 811 - 352 + 380 @@ -240,8 +240,8 @@ QToolBox::tab:hover { 0 0 - 397 - 232 + 811 + 380 @@ -370,8 +370,8 @@ QToolBox::tab:hover { 0 0 - 352 - 176 + 811 + 380 @@ -446,8 +446,8 @@ QToolBox::tab:hover { 0 0 - 351 - 76 + 811 + 380 @@ -525,8 +525,8 @@ QToolBox::tab:hover { 0 0 - 410 - 120 + 811 + 380 @@ -596,8 +596,8 @@ QToolBox::tab:hover { 0 0 - 352 - 151 + 811 + 380 @@ -628,27 +628,34 @@ QToolBox::tab:hover { - + + + + Clear search history + + + + + + + Show &controls in the viewer window + + + + Remember last used &window size and layout - + Remember the &current page when quitting - - - - Show &controls in the viewer window - - - diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py index 4587a6542b..3b63d51c15 100644 --- a/src/calibre/gui2/viewer/main.py +++ b/src/calibre/gui2/viewer/main.py @@ -940,6 +940,9 @@ class EbookViewer(MainWindow, Ui_EbookViewer): def do_config(self): self.view.config(self) self.load_theme_menu() + from calibre.gui2 import config + if not config['viewer_search_history']: + self.search.clear_history() def bookmark(self, *args): num = 1