mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Add XPath tutorial
This commit is contained in:
parent
416f49f4c4
commit
32042325b9
@ -37,10 +37,10 @@ def config(defaults=None):
|
|||||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
||||||
help=_('''\
|
help=_('''\
|
||||||
An XPath expression to detect chapter titles. The default is to consider <h1> or
|
An XPath expression to detect chapter titles. The default is to consider <h1> or
|
||||||
<h2> tags that contain the text "chapter" or "book" or "section" as chapter titles. This
|
<h2> tags that contain the text "chapter" or "book" or "section" as chapter titles.
|
||||||
is achieved by the expression: "//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]"
|
|
||||||
The expression used must evaluate to a list of elements. To disable chapter detection,
|
The expression used must evaluate to a list of elements. To disable chapter detection,
|
||||||
use the expression "/".
|
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
|
||||||
|
help on using this feature.
|
||||||
''').replace('\n', ' '))
|
''').replace('\n', ' '))
|
||||||
structure('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
|
structure('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False,
|
||||||
help=_('Don\'t add detected chapters to the Table of Contents'))
|
help=_('Don\'t add detected chapters to the Table of Contents'))
|
||||||
|
@ -87,7 +87,7 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
|||||||
def rewrite_links(self, olink):
|
def rewrite_links(self, olink):
|
||||||
'''
|
'''
|
||||||
Make all links in document relative so that they work in the EPUB container.
|
Make all links in document relative so that they work in the EPUB container.
|
||||||
Also copies any resources (like image, stylesheets, scripts, etc.) into
|
Also copies any resources (like images, stylesheets, scripts, etc.) into
|
||||||
the local tree.
|
the local tree.
|
||||||
'''
|
'''
|
||||||
if not isinstance(olink, unicode):
|
if not isinstance(olink, unicode):
|
||||||
@ -103,7 +103,7 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
|||||||
name, ext = os.path.splitext(name)
|
name, ext = os.path.splitext(name)
|
||||||
name += ('_%d'%len(self.resource_map)) + ext
|
name += ('_%d'%len(self.resource_map)) + ext
|
||||||
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
||||||
name = 'resources/'+name
|
name = 'resources/' + name
|
||||||
self.resource_map[link.path] = name
|
self.resource_map[link.path] = name
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
@ -95,7 +95,7 @@ $desc
|
|||||||
#end
|
#end
|
||||||
#for opt in options
|
#for opt in options
|
||||||
${option(opt)}
|
${option(opt)}
|
||||||
${opt.help.replace('\n', ' ').replace('%default', str(opt.default)) if opt.help else ''}
|
${opt.help.replace('\n', ' ').replace('*', '\\*').replace('%default', str(opt.default)) if opt.help else ''}
|
||||||
||
|
||
|
||||||
#end
|
#end
|
||||||
#end
|
#end
|
||||||
|
@ -27,4 +27,4 @@ Glossary
|
|||||||
**URL** *(Uniform Resource Locator)* for example: ``http://example.com``
|
**URL** *(Uniform Resource Locator)* for example: ``http://example.com``
|
||||||
|
|
||||||
regexp
|
regexp
|
||||||
**Regular expressions** provide a concise and flexible means for identifying strings of text of interest, such as particular characters, words, or patterns of characters. See http://docs.python.org/lib/re-syntax.html for the syntax of regular expressions used in python.
|
**Regular expressions** provide a concise and flexible means for identifying strings of text of interest, such as particular characters, words, or patterns of characters. See `regexp syntax <http://docs.python.org/lib/re-syntax.html>`_ for the syntax of regular expressions used in Python.
|
||||||
|
@ -29,6 +29,7 @@ Sections
|
|||||||
conversion
|
conversion
|
||||||
metadata
|
metadata
|
||||||
faq
|
faq
|
||||||
|
xpath
|
||||||
glossary
|
glossary
|
||||||
|
|
||||||
Convenience
|
Convenience
|
||||||
|
108
src/calibre/manual/xpath.rst
Normal file
108
src/calibre/manual/xpath.rst
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
.. include:: global.rst
|
||||||
|
|
||||||
|
.. _xpath-tutorial:
|
||||||
|
|
||||||
|
XPath Tutorial
|
||||||
|
==============
|
||||||
|
|
||||||
|
In this tutorial, you will be given a gentle introduction to
|
||||||
|
`XPath <http://en.wikipedia.org/wiki/XPath>`_, a query language that can be
|
||||||
|
used to select arbitrary parts of `HTML <http://en.wikipedia.org/wiki/HTML>`_
|
||||||
|
documents in |app|. XPath is a widely
|
||||||
|
used standard, and googling it will yield a ton of information. This tutorial,
|
||||||
|
however, focuses on using XPath for ebook related tasks like finding chapter
|
||||||
|
headings in an unstructured HTML document.
|
||||||
|
|
||||||
|
.. contents:: Contents
|
||||||
|
:depth: 1
|
||||||
|
:local:
|
||||||
|
|
||||||
|
Selecting by tagname
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
The simplest form of selection is to select tags by name. For example,
|
||||||
|
suppose you want to select all the ``<h2>`` tags in a document. The XPath
|
||||||
|
query for this is simply::
|
||||||
|
|
||||||
|
//h2 (Selects all <h2> tags)
|
||||||
|
|
||||||
|
The prefix ``//`` means *search at any level of the document*. Now suppose you
|
||||||
|
want to search for ``<span>`` tags that are inside ``<a>`` tags. That can be
|
||||||
|
achieved with::
|
||||||
|
|
||||||
|
//a/span (Selects <span> tags inside <a> tags)
|
||||||
|
|
||||||
|
If you want to search for tags at a particular level in the document, change
|
||||||
|
the prefix::
|
||||||
|
|
||||||
|
/body/div/p (Selects <p> tags that are children of <div> tags that are
|
||||||
|
children of the <body> tag)
|
||||||
|
|
||||||
|
This will match only ``<p>A very short ebook to demonstrate the use of XPath.</p>``
|
||||||
|
in the `Sample ebook`_ but not any of the other ``<p>`` tags.
|
||||||
|
|
||||||
|
Now suppose you want to select both ``<h1>`` and ``<h2>`` tags. To do that,
|
||||||
|
we need an XPath construct called a *predicate*. A :dfn:`predicate` is simply
|
||||||
|
a test that is used to select tags. Tests can be arbitrarily powerful and as
|
||||||
|
this tutorial progresses, you will see more powerful examples. A predicate
|
||||||
|
is created by enclosing the test expression in square brackets::
|
||||||
|
|
||||||
|
//*[name()='h1' or name()='h2']
|
||||||
|
|
||||||
|
There are several new features in this XPath expression. The first is the use
|
||||||
|
of the wildcard ``*``. It means *match any tag*. Now look at the test expression
|
||||||
|
``name()='h1' or name()='h2'``. :term:`name()` is an example of a *built-in function*.
|
||||||
|
It simply evaluates to the name of the tag. So by using it, we can select tags
|
||||||
|
whose names are either ``h1`` or ``h2``. XPath has several useful built-in functions.
|
||||||
|
A few more will be introduced in this tutorial.
|
||||||
|
|
||||||
|
Selecting by attributes
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
To select tags based on their attributes, the use of predicates is required::
|
||||||
|
|
||||||
|
//*[@style] (Select all tags that have a style attribute)
|
||||||
|
//*[@class="chapter"] (Select all tags that have class="chapter")
|
||||||
|
//h1[@class="bookTitle"] (Select all h1 tags that have class="bookTitle")
|
||||||
|
|
||||||
|
Here, the ``@`` operator refers to the attributes of the tag. You can use some
|
||||||
|
of the `XPath built-in functions`_ to perform more sophisticated
|
||||||
|
matching on attribute values.
|
||||||
|
|
||||||
|
|
||||||
|
Selecting by tag content
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
Using XPath, you can even select tags based on the text they contain. The best way to do this is
|
||||||
|
to use the power of *regular expressions* via the built-in function :term:`re:test()`::
|
||||||
|
|
||||||
|
//h2[re:test(., 'chapter|section', 'i')] (Selects <h2> tags that contain the words chapter or
|
||||||
|
section)
|
||||||
|
|
||||||
|
Here the ``.`` operator refers to the contents of the tag, just as the ``@`` operator referred
|
||||||
|
to its attributes.
|
||||||
|
|
||||||
|
|
||||||
|
Sample ebook
|
||||||
|
------------
|
||||||
|
|
||||||
|
.. literalinclude:: xpath.xhtml
|
||||||
|
:language: html
|
||||||
|
|
||||||
|
XPath built-in functions
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
.. glossary::
|
||||||
|
|
||||||
|
name()
|
||||||
|
The name of the current tag.
|
||||||
|
|
||||||
|
contains()
|
||||||
|
``contains(s1, s2)`` returns `true` if s1 contains s2.
|
||||||
|
|
||||||
|
re:test()
|
||||||
|
``re:test(src, pattern, flags)`` returns `true` if the string `src` matches the
|
||||||
|
regular expression `pattern`. A particularly useful flag is ``i``, which makes matching
|
||||||
|
case insensitive. A good primer on the syntax for regular expressions can be found
|
||||||
|
at `regexp syntax <http://docs.python.org/lib/re-syntax.html>`_.
|
||||||
|
|
19
src/calibre/manual/xpath.xhtml
Normal file
19
src/calibre/manual/xpath.xhtml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>A very short ebook</title>
|
||||||
|
<meta name="charset" value="utf-8" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1 class="bookTitle">A very short ebook</h1>
|
||||||
|
<p style="text-align:right">Written by Kovid Goyal</p>
|
||||||
|
<div class="introduction">
|
||||||
|
<p>A very short ebook to demonstrate the use of XPath.</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h2 class="chapter">Chapter One</h2>
|
||||||
|
<p>This is a truly fascinating chapter.</p>
|
||||||
|
|
||||||
|
<h2 class="chapter">Chapter Two</h2>
|
||||||
|
<p>A worthy continuation of a fine tradition.</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
x
Reference in New Issue
Block a user