diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe
deleted file mode 100644
index e29bfe3dde..0000000000
--- a/resources/recipes/wsj_free.recipe
+++ /dev/null
@@ -1,314 +0,0 @@
-#!/usr/bin/env python
-
-__license__ = 'GPL v3'
-
-'''
-online.wsj.com
-'''
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-from datetime import timedelta, date
-
-class WSJ(BasicNewsRecipe):
-    # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
-    title = u'Wall Street Journal (free)'
-    __author__ = 'Nick Redding'
-    language = 'en'
-    description = ('All the free content from the Wall Street Journal (business, financial and political news)')
-
-    no_stylesheets = True
-    timefmt = ' [%b %d]'
-
-    # customization notes: delete sections you are not interested in
-    # set omit_paid_content to False if you want the paid content article snippets
-    # set oldest_article to the maximum number of days back from today to include articles
-    sectionlist = [
-        ['/home-page','Front Page'],
-        ['/public/page/news-opinion-commentary.html','Commentary'],
-        ['/public/page/news-global-world.html','World News'],
-        ['/public/page/news-world-business.html','US News'],
-        ['/public/page/news-business-us.html','Business'],
-        ['/public/page/news-financial-markets-stock.html','Markets'],
-        ['/public/page/news-tech-technology.html','Technology'],
-        ['/public/page/news-personal-finance.html','Personal Finnce'],
-        ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
-        ['/public/page/news-real-estate-homes.html','Real Estate'],
-        ['/public/page/news-career-jobs.html','Careers'],
-        ['/public/page/news-small-business-marketing.html','Small Business']
-        ]
-    oldest_article = 2
-    omit_paid_content = True
-
-    extra_css = '''h1{font-size:large; font-family:Times,serif;}
-                   h2{font-family:Times,serif; font-size:small; font-style:italic;}
-                   .subhead{font-family:Times,serif; font-size:small; font-style:italic;}
-                   .insettipUnit {font-family:Times,serif;font-size:xx-small;}
-                   .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
-                   .article{font-family:Times,serif; font-size:x-small;}
-                   .tagline { font-size:xx-small;}
-                   .dateStamp {font-family:Times,serif;}
-                   h3{font-family:Times,serif; font-size:xx-small;}
-                   .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
-                   .metadataType-articleCredits {list-style-type: none;}
-                   h6{font-family:Times,serif; font-size:small; font-style:italic;}
-                   .paperLocation{font-size:xx-small;}'''
-
-
-    remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
-    remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}),
-                    #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
-                    #         "articleTabs_tab_interactive","articleTabs_tab_video",
-                    #         "articleTabs_tab_map","articleTabs_tab_slideshow"]),
-                    {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
-                               'insettip','insetClose','more_in', "insetContent",
-                               # 'articleTools_bottom','articleTools_bottom mjArticleTools',
-                               'aTools', 'tooltip',
-                               'adSummary', 'nav-inline','insetFullBracket']},
-                    dict({'class':re.compile('^articleTools_bottom')}),
-                    dict(rel='shortcut icon')
-                    ]
-    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
-
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser()
-        return br
-
-
-    def preprocess_html(self,soup):
-
-        def decode_us_date(datestr):
-            udate = datestr.strip().lower().split()
-            m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1
-            d = int(udate[1])
-            y = int(udate[2])
-            return date(y,m,d)
-
-        # check if article is paid content
-        if self.omit_paid_content:
-            divtags = soup.findAll('div','tooltip')
-            if divtags:
-                for divtag in divtags:
-                    if divtag.find(text="Subscriber Content"):
-                        return None
-
-        # check if article is too old
-        datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
-        if datetag:
-            dateline_string = self.tag_to_string(datetag,False)
-            date_items = dateline_string.split(',')
-            datestring = date_items[0]+date_items[1]
-            article_date = decode_us_date(datestring)
-            earliest_date = date.today() - timedelta(days=self.oldest_article)
-            if article_date < earliest_date:
-                self.log("Skipping article dated %s" % datestring)
-                return None
-            datetag.parent.extract()
-
-        # place dateline in article heading
-
-        bylinetag = soup.find('h3','byline')
-        if bylinetag:
-            h3bylinetag = bylinetag
-        else:
-            bylinetag = soup.find('li','byline')
-            if bylinetag:
-                h3bylinetag = bylinetag.h3
-                if not h3bylinetag:
-                    h3bylinetag = bylinetag
-                bylinetag = bylinetag.parent
-        if bylinetag:
-            if h3bylinetag.a:
-                bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
-            else:
-                bylinetext = self.tag_to_string(h3bylinetag,False)
-            h3byline = Tag(soup,'h3',[('class','byline')])
-            if bylinetext.isspace() or (bylinetext == ''):
-                h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
-            else:
-                h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
-            bylinetag.replaceWith(h3byline)
-        else:
-            headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
-            if headlinetag:
-                dateline = Tag(soup,'h3', [('class','byline')])
-                dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
-                headlinetag.insert(len(headlinetag),dateline)
-            else: # if no date tag, don't process this page--it's not a news item
-                return None
-        # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
-        ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
-        if ultag:
-            a = ultag.h3
-            if a:
-                ultag.replaceWith(a)
-        return soup
-
-    def parse_index(self):
-
-        articles = {}
-        key = None
-        ans = []
-
-        def parse_index_page(page_name,page_title):
-
-            def article_title(tag):
-                atag = tag.find('h2') # title is usually in an h2 tag
-                if not atag: # if not, get text from the a tag
-                    atag = tag.find('a',href=True)
-                    if not atag:
-                        return ''
-                    t = self.tag_to_string(atag,False)
-                    if t == '':
-                        # sometimes the title is in the second a tag
-                        atag.extract()
-                        atag = tag.find('a',href=True)
-                        if not atag:
-                            return ''
-                        return self.tag_to_string(atag,False)
-                    return t
-                return self.tag_to_string(atag,False)
-
-            def article_author(tag):
-                atag = tag.find('strong') # author is usually in a strong tag
-                if not atag:
-                    atag = tag.find('h4') # if not, look for an h4 tag
-                    if not atag:
-                        return ''
-                return self.tag_to_string(atag,False)
-
-            def article_summary(tag):
-                atag = tag.find('p')
-                if not atag:
-                    return ''
-                subtag = atag.strong
-                if subtag:
-                    subtag.extract()
-                return self.tag_to_string(atag,False)
-
-            def article_url(tag):
-                atag = tag.find('a',href=True)
-                if not atag:
-                    return ''
-                url = re.sub(r'\?.*', '', atag['href'])
-                return url
-
-            def handle_section_name(tag):
-                # turns a tag into a section name with special processing
-                # for Wat's News, U.S., World & U.S. and World
-                s = self.tag_to_string(tag,False)
-                if ("What" in s) and ("News" in s):
-                    s = "What's News"
-                elif (s == "U.S.") or (s == "World & U.S.") or (s == "World"):
-                    s = s + " News"
-                return s
-
-
-
-            mainurl = 'http://online.wsj.com'
-            pageurl = mainurl+page_name
-            #self.log("Page url %s" % pageurl)
-            soup = self.index_to_soup(pageurl)
-            # Find each instance of div with class including "headlineSummary"
-            for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
-                # divtag contains all article data as ul's and li's
-                # first, check if there is an h3 tag which provides a section name
-                stag = divtag.find('h3')
-                if stag:
-                    if stag.parent.get('class', '') == 'dynamic':
-                        # a carousel of articles is too complex to extract a section name
-                        # for each article, so we'll just call the section "Carousel"
-                        section_name = 'Carousel'
-                    else:
-                        section_name = handle_section_name(stag)
-                else:
-                    section_name = "What's News"
-                #self.log("div Section %s" % section_name)
-                # find each top-level ul in the div
-                # we don't restrict to class = newsItem because the section_name
-                # sometimes changes via a ul tag inside the div
-                for ultag in divtag.findAll('ul',recursive=False):
-                    stag = ultag.find('h3')
-                    if stag:
-                        if stag.parent.name == 'ul':
-                            # section name has changed
-                            section_name = handle_section_name(stag)
-                            #self.log("ul Section %s" % section_name)
-                            # delete the h3 tag so it doesn't get in the way
-                            stag.extract()
-                    # find each top level li in the ul
-                    for litag in ultag.findAll('li',recursive=False):
-                        stag = litag.find('h3')
-                        if stag:
-                            # section name has changed
-                            section_name = handle_section_name(stag)
-                            #self.log("li Section %s" % section_name)
-                            # delete the h3 tag so it doesn't get in the way
-                            stag.extract()
-                        # if there is a ul tag inside the li it is superfluous;
-                        # it is probably a list of related articles
-                        utag = litag.find('ul')
-                        if utag:
-                            utag.extract()
-                        # now skip paid subscriber articles if desired
-                        subscriber_tag = litag.find(text="Subscriber Content")
-                        if subscriber_tag:
-                            if self.omit_paid_content:
-                                continue
-                        # delete the tip div so it doesn't get in the way
-                        tiptag = litag.find("div", { "class" : "tipTargetBox" })
-                        if tiptag:
-                            tiptag.extract()
-                        h1tag = litag.h1
-                        # if there's an h1 tag, it's parent is a div which should replace
-                        # the li tag for the analysis
-                        if h1tag:
-                            litag = h1tag.parent
-                        h5tag = litag.h5
-                        if h5tag:
-                            # section mame has changed
-                            section_name = self.tag_to_string(h5tag,False)
-                            #self.log("h5 Section %s" % section_name)
-                            # delete the h5 tag so it doesn't get in the way
-                            h5tag.extract()
-                        url = article_url(litag)
-                        if url == '':
-                            continue
-                        if url.startswith("/article"):
-                            url = mainurl+url
-                        if not url.startswith("http://online.wsj.com"):
-                            continue
-                        if not url.endswith(".html"):
-                            continue
-                        if 'video' in url:
-                            continue
-                        title = article_title(litag)
-                        if title == '':
-                            continue
-                        #self.log("URL %s" % url)
-                        #self.log("Title %s" % title)
-                        pubdate = ''
-                        #self.log("Date %s" % pubdate)
-                        author = article_author(litag)
-                        if author == '':
-                            author = section_name
-                        elif author == section_name:
-                            author = ''
-                        else:
-                            author = section_name+': '+author
-                        #if not author == '':
-                        #    self.log("Author %s" % author)
-                        description = article_summary(litag)
-                        #if not description == '':
-                        #    self.log("Description %s" % description)
-                        if not articles.has_key(page_title):
-                            articles[page_title] = []
-                        articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-
-        for page_name,page_title in self.sectionlist:
-            parse_index_page(page_name,page_title)
-            ans.append(page_title)
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
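Note for anyone carrying a private copy of the deleted recipe forward: the age cutoff in preprocess_html is self-contained and easy to reuse. It reassembles the "Month D, YYYY" part of the page's dateStamp line and skips the article when that date falls before today minus oldest_article days. A standalone sketch of the same check, assuming the same dateline format (is_too_old is an illustrative helper, not a name from the recipe):

    from datetime import date, timedelta

    MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december']

    def decode_us_date(datestr):
        # "February 3, 2010" -> date(2010, 2, 3); mirrors the recipe's
        # decode_us_date, except the comma is stripped here rather than by
        # the caller splitting the dateline on ','.
        month, day, year = datestr.replace(',', ' ').strip().lower().split()
        return date(int(year), MONTHS.index(month) + 1, int(day))

    def is_too_old(datestr, oldest_article=2):
        # True when the recipe would log "Skipping article" and return None.
        earliest = date.today() - timedelta(days=oldest_article)
        return decode_us_date(datestr) < earliest

    print is_too_old('February 3, 2010')  # True on any run after Feb 2010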
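The index structure the recipe builds is BasicNewsRecipe's standard parse_index contract, which still holds for any replacement recipe: return a list of (section_title, article_list) tuples, each article a dict with the keys assembled above. A minimal skeleton under that contract (the class name, section title, and article values are placeholders):

    from calibre.web.feeds.recipes import BasicNewsRecipe

    class MinimalIndexRecipe(BasicNewsRecipe):
        title = u'parse_index skeleton'

        def parse_index(self):
            # Same keys the WSJ recipe fills in; title and url are the
            # essential ones, the rest may be empty strings.
            articles = [dict(title='Example headline',
                             url='http://online.wsj.com/article/example.html',
                             date='', description='', author='', content='')]
            # One (section_title, article_list) tuple per section.
            return [('Front Page', articles)]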
diff --git a/src/calibre/gui2/convert/single.ui b/src/calibre/gui2/convert/single.ui
index 290b31aaec..5b976a5cfb 100644
--- a/src/calibre/gui2/convert/single.ui
+++ b/src/calibre/gui2/convert/single.ui
@@ -31,7 +31,14 @@
-      <widget class="QComboBox" name="input_formats"/>
+      <widget class="QComboBox" name="input_formats">
+       <property name="sizeAdjustPolicy">
+        <enum>QComboBox::AdjustToMinimumContentsLengthWithIcon</enum>
+       </property>
+       <property name="minimumContentsLength">
+        <number>5</number>
+       </property>
+      </widget>
@@ -64,7 +71,14 @@
-      <widget class="QComboBox" name="output_formats"/>
+      <widget class="QComboBox" name="output_formats">
+       <property name="sizeAdjustPolicy">
+        <enum>QComboBox::AdjustToMinimumContentsLengthWithIcon</enum>
+       </property>
+       <property name="minimumContentsLength">
+        <number>5</number>
+       </property>
+      </widget>
@@ -115,8 +129,8 @@
        <rect>
         <x>0</x>
         <y>0</y>
-        <width>810</width>
-        <height>489</height>
+        <width>805</width>
+        <height>484</height>
        </rect>
diff --git a/src/calibre/libunrar.py b/src/calibre/libunrar.py
index bf38a47d64..a71fd8718a 100644
--- a/src/calibre/libunrar.py
+++ b/src/calibre/libunrar.py
@@ -177,7 +177,7 @@ def extract(path, dir):
     try:
         if open_archive_data.OpenResult != 0:
             raise UnRARException(_interpret_open_error(open_archive_data.OpenResult, path))
-        prints('Archive:', path)
+        #prints('Archive:', path)
         #print get_archive_info(open_archive_data.Flags)
         header_data = RARHeaderDataEx(CmtBuf=None)
         #_libunrar.RARSetCallback(arc_data, callback_func, mode)
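On the single.ui change: with QComboBox::AdjustToMinimumContentsLengthWithIcon plus a minimumContentsLength of 5, the two format combo boxes size themselves to roughly five characters plus the drop-down icon instead of to their widest item, so a long format name no longer forces the conversion dialog wide. The same effect done in code, as a plain PyQt4 sketch rather than anything taken from calibre's dialog setup:

    import sys
    from PyQt4.Qt import QApplication, QComboBox

    app = QApplication(sys.argv)
    combo = QComboBox()
    combo.addItems(['EPUB', 'MOBI', 'a-very-long-format-name'])
    # Matches the two properties set in the .ui hunks above.
    combo.setSizeAdjustPolicy(QComboBox.AdjustToMinimumContentsLengthWithIcon)
    combo.setMinimumContentsLength(5)
    combo.show()
    app.exec_()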