From f1d81044505a97529bdf36cbd02d29c1f04bfdbb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 23 Jan 2010 14:55:28 -0700 Subject: [PATCH] New recipe for Neowin by Darko Miletic --- resources/images/news/neowin.png | Bin 0 -> 1068 bytes resources/recipes/neowin.recipe | 40 +++++++++++ src/calibre/ebooks/pdf/reflow.py | 110 ++++++++++++++++++------------- 3 files changed, 104 insertions(+), 46 deletions(-) create mode 100644 resources/images/news/neowin.png create mode 100644 resources/recipes/neowin.recipe diff --git a/resources/images/news/neowin.png b/resources/images/news/neowin.png new file mode 100644 index 0000000000000000000000000000000000000000..5aee949c0bb7e844b4b47169f3302776087be6d4 GIT binary patch literal 1068 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87??vnT^vI!PRCA;&WJ8$I9C6^*z=sH-(1fUzd#8k2CH1H%;KsUTcM=R~q%}XP@tvtN$zAQ!wLL1+!D=_hYvF*9$jn&ux33 z{iyk}`|;q;7b^sk!Wj#+?^*9&XKqztaP+3JzR=5ePPsh`+m%;2($ zPUFY^SVeuj$3)Di`U7CcHNiHmw)`SQO@Z5_KQ<;R8m?mI4inn zoLcy=Y~7hVAz@+W@t4=dMOXQ(I%vyWC+jNM;I&`XBh4pGDA(t&;G<(somUU>z2&uP zo+-BPucMLec9S)!;@xl4G#GwZOqKc{A~H*NY73tsqjF*EvV|65cT%2Jbt-E*-nekl zWa>}*>%viMvEkI!KYY?NEtWP)v`e${GA#-bOcKBS(eaAr^w@7()vs-R zSCUbrSe^ai)*q>*F`R;i1>0BOn61~k==388o2ff@ zi`~upiF41Sg`4JA&+cL~UEdeJHgMbOC!frM&$6liikT!v9oemfXc5Pa|{axURe4{(*pYF|8&Y1OU^ZVcDmtLN0 zU4A|!Cf-!`x_$9Kr*BV=ES(dj^Y)Q9XG4=p5^F`thR}IcKO6mf7kqLvntAKzdFl5{ zUfpWvmYyoS|Nq{P9_tde{|f5b_Uyy0h0GgY_KAJ|+>q#d@AQ9e!=-83#i!gAfw@h! z#5JNMC9x#cD!C{XNHG{07#ipr80s1shZveznHpIcTId>>TNxOf*)TH!MMG|WN@iLm fiUw0F6H6;26Nm-@tBpT_8W=oX{an^LB{Ts5fE3tc literal 0 HcmV?d00001 diff --git a/resources/recipes/neowin.recipe b/resources/recipes/neowin.recipe new file mode 100644 index 0000000000..9f5a669a75 --- /dev/null +++ b/resources/recipes/neowin.recipe @@ -0,0 +1,40 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class Neowin(BasicNewsRecipe): + title = u'Neowin.net' + oldest_article = 5 + language = 'en' + description = 'News from IT' + publisher = 'Neowin' + category = 'news, IT, Microsoft, Apple, hardware, software, games' + __author__ = 'Darko Miletic' + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf8' + + conversion_options = { + 'tags' : category + ,'language' : language + ,'comments' : description + ,'publisher' : publisher + } + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + remove_tags_after = dict(name='div', attrs={'id':'tag-bar'}) + + remove_tags = [ + dict(name=['base','object','link','iframe']) + ,dict(name='div', attrs={'id':'tag-bar'}) + ] + + feeds = [ + (u'Software' , u'http://www.neowin.net/news/rss/software' ) + ,(u'Gaming' , u'http://www.neowin.net/news/rss/gaming' ) + ,(u'Microsoft', u'http://www.neowin.net/news/rss/microsoft') + ,(u'Apple' , u'http://www.neowin.net/news/rss/apple' ) + ,(u'Editorial', u'http://www.neowin.net/news/rss/editorial') + ] + def image_url_processor(cls, baseurl, url): + return url + diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 73178f5621..1a0e5e0dcb 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -18,48 +18,11 @@ class Font(object): self.color = spec.get('color') self.family = spec.get('family') -class Column(object): - - # A column contains an element is the element bulges out to - # the left or the right by at most HFUZZ*col width. - HFUZZ = 0.2 - - def __init__(self): - self.left = self.right = self.top = self.bottom = 0 - self.width = self.height = 0 - self.elements = [] - self.average_line_separation = 0 - - def add(self, elem): - if elem in self.elements: return - self.elements.append(elem) - self.elements.sort(cmp=lambda x,y:cmp(x.bottom,y.bottom)) - self.top = self.elements[0].top - self.bottom = self.elements[-1].bottom - self.left, self.right = sys.maxint, 0 - for x in self: - self.left = min(self.left, x.left) - self.right = max(self.right, x.right) - self.width, self.height = self.right-self.left, self.bottom-self.top - - def __iter__(self): - for x in self.elements: - yield x - - def contains(self, elem): - return elem.left > self.left - self.HFUZZ*self.width and \ - elem.right < self.right + self.HFUZZ*self.width - - def collect_stats(self): - if len(self.elements) > 1: - gaps = [self.elements[i+1].top - self.elements[i].bottom for i in - range(len(0, len(self.elements)-1))] - self.average_line_separation = sum(gaps)/len(gaps) - class Element(object): def __init__(self): - self.starts_paragraph = False + self.starts_block = None + self.block_style = None def __eq__(self, other): return self.id == other.id @@ -152,6 +115,61 @@ class Interval(object): def __hash__(self): return hash('(%f,%f)'%self.left, self.right) +class Column(object): + + # A column contains an element is the element bulges out to + # the left or the right by at most HFUZZ*col width. + HFUZZ = 0.2 + + + def __init__(self): + self.left = self.right = self.top = self.bottom = 0 + self.width = self.height = 0 + self.elements = [] + self.average_line_separation = 0 + + def add(self, elem): + if elem in self.elements: return + self.elements.append(elem) + self.elements.sort(cmp=lambda x,y:cmp(x.bottom,y.bottom)) + self.top = self.elements[0].top + self.bottom = self.elements[-1].bottom + self.left, self.right = sys.maxint, 0 + for x in self: + self.left = min(self.left, x.left) + self.right = max(self.right, x.right) + self.width, self.height = self.right-self.left, self.bottom-self.top + + def __iter__(self): + for x in self.elements: + yield x + + def contains(self, elem): + return elem.left > self.left - self.HFUZZ*self.width and \ + elem.right < self.right + self.HFUZZ*self.width + + def collect_stats(self): + if len(self.elements) > 1: + gaps = [self.elements[i+1].top - self.elements[i].bottom for i in + range(len(0, len(self.elements)-1))] + self.average_line_separation = sum(gaps)/len(gaps) + for i, elem in enumerate(self.elements): + left_margin = elem.left - self.left + elem.indent_fraction = left_margin/self.width + elem.width_fraction = elem.width/self.width + if i == 0: + elem.top_gap = None + else: + elem.top_gap = self.elements[i-1].bottom - elem.top + + def previous_element(self, idx): + if idx == 0: + return None + return self.elements[idx-1] + + + + class Region(object): def __init__(self): @@ -168,6 +186,7 @@ class Region(object): self.columns[i].add(elem) def contains(self, columns): + # TODO: handle unbalanced columns if not self.columns: return True if len(columns) != len(self.columns): @@ -187,7 +206,7 @@ class Region(object): return len(self.elements) == 0 def collect_stats(self): - for column in self.column: + for column in self.columns: column.collect_stats() self.average_line_separation = sum([x.average_line_separation for x in self.columns])/float(len(self.columns)) @@ -196,11 +215,10 @@ class Region(object): for x in self.columns: yield x - def detect_paragraphs(self): - first = True - for col in self: - col.detect_paragraphs(self.average_line_separation, first) - first = False + def linearize(self): + self.elements = [] + for x in self.columns: + self.elements.extend(x) class Page(object): @@ -332,7 +350,7 @@ class Page(object): 'Locate paragraph boundaries in each column' for region in self.regions: region.collect_stats() - region.detect_paragraphs() + region.linearize() class PDFDocument(object):