mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Support Dublin Core id tags when importing HTML
All of the formats below are supported and are interpreted as "foo:bar": <meta name="DC.identifier" scheme="foo" content="bar" /> <meta name="dc:identifier.foo" content="bar" /> <meta name="DCTERMS:identifier" scheme="foo" content="bar" /> <meta name="dcterms.identifier.foo" content="bar" />
This commit is contained in:
		
							parent
							
								
									c1663d3cc8
								
							
						
					
					
						commit
						9c846f3b51
					
				@ -10,6 +10,8 @@ Try to read metadata from an HTML file.
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from HTMLParser import HTMLParser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from calibre.ebooks.metadata import string_to_authors
 | 
					from calibre.ebooks.metadata import string_to_authors
 | 
				
			||||||
from calibre.ebooks.metadata.book.base import Metadata
 | 
					from calibre.ebooks.metadata.book.base import Metadata
 | 
				
			||||||
from calibre.ebooks.chardet import xml_to_unicode
 | 
					from calibre.ebooks.chardet import xml_to_unicode
 | 
				
			||||||
@ -87,6 +89,26 @@ def parse_meta_tags(src):
 | 
				
			|||||||
                return ans
 | 
					                return ans
 | 
				
			||||||
    return ans
 | 
					    return ans
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_meta_tag_identifiers(src):
					    """Collect Dublin Core identifiers declared in the <meta> tags of src.

					    A <meta> tag is considered when its ``name`` attribute starts with
					    ``dc`` or ``dcterms``, followed by ``.`` or ``:`` and ``identifier``
					    (case-insensitive). The identifier scheme is taken from the ``scheme``
					    attribute or, when that is missing or blank, from a third ``.``/``:``
					    separated component of the name. All of the following yield
					    ``{'foo': 'bar'}``::

					        <meta name="DC.identifier" scheme="foo" content="bar" />
					        <meta name="dc:identifier.foo" content="bar" />
					        <meta name="DCTERMS:identifier" scheme="foo" content="bar" />
					        <meta name="dcterms.identifier.foo" content="bar" />

					    :param src: HTML source text to scan.
					    :return: dict mapping the lower-cased scheme to the identifier value
					        (passed through ``replace_entities``). Tags lacking either a
					        scheme or a non-blank content are ignored; when a scheme occurs
					        more than once the last occurrence wins.
					    """
					    meta_identifiers = {}
					    # Compiled once instead of on every start tag.
					    name_pat = re.compile(r'(?:dc|dcterms)[\.:]identifier', flags=re.IGNORECASE)

					    class MetadataParser(HTMLParser):

					        def handle_starttag(self, tag, attrs):
					            if tag != 'meta':
					                return
					            # HTMLParser reports valueless attributes (e.g. <meta name>)
					            # with a value of None; normalise those to '' so the regex
					            # match and the .strip() calls below cannot raise.
					            attr_dict = {key: (value or '') for key, value in attrs}
					            name = attr_dict.get('name', '')
					            if not name_pat.match(name):
					                return

					            content = attr_dict.get('content', '').strip()
					            scheme = attr_dict.get('scheme', '').strip()
					            if not scheme:
					                # Fall back to the "dc.identifier.<scheme>" naming form.
					                elements = re.split(r'[\.:]', name)
					                if len(elements) == 3:
					                    scheme = elements[2]
					            if content and scheme:
					                meta_identifiers[scheme.lower()] = replace_entities(content)

					    MetadataParser().feed(src)

					    return meta_identifiers
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def parse_comment_tags(src):
 | 
					def parse_comment_tags(src):
 | 
				
			||||||
    all_names = '|'.join(itervalues(COMMENT_NAMES))
 | 
					    all_names = '|'.join(itervalues(COMMENT_NAMES))
 | 
				
			||||||
@ -113,6 +135,7 @@ def get_metadata_(src, encoding=None):
 | 
				
			|||||||
    src = src[:150000]  # Searching shouldn't take too long
 | 
					    src = src[:150000]  # Searching shouldn't take too long
 | 
				
			||||||
    comment_tags = parse_comment_tags(src)
 | 
					    comment_tags = parse_comment_tags(src)
 | 
				
			||||||
    meta_tags = parse_meta_tags(src)
 | 
					    meta_tags = parse_meta_tags(src)
 | 
				
			||||||
 | 
					    meta_tag_ids = parse_meta_tag_identifiers(src)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get(field):
 | 
					    def get(field):
 | 
				
			||||||
        ans = comment_tags.get(field, meta_tags.get(field, None))
 | 
					        ans = comment_tags.get(field, meta_tags.get(field, None))
 | 
				
			||||||
@ -193,4 +216,8 @@ def get_metadata_(src, encoding=None):
 | 
				
			|||||||
        if tags:
 | 
					        if tags:
 | 
				
			||||||
            mi.tags = tags
 | 
					            mi.tags = tags
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # IDENTIFIERS
 | 
				
			||||||
 | 
					    for (k,v) in meta_tag_ids.iteritems():
 | 
				
			||||||
 | 
					        mi.set_identifier(k, v)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return mi
 | 
					    return mi
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user