diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index c2309d9f8b..f13d6e64f5 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -37,10 +37,10 @@ def config(defaults=None): structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]", help=_('''\ An XPath expression to detect chapter titles. The default is to consider

<h1> or -<h2> tags that contain the text "chapter" or "book" or "section" as chapter titles. This -is achieved by the expression: "//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]" +<h2>
tags that contain the text "chapter" or "book" or "section" as chapter titles. The expression used must evaluate to a list of elements. To disable chapter detection, -use the expression "/". +use the expression "/". See the XPath Tutorial in the calibre User Manual for further +help on using this feature. ''').replace('\n', ' ')) structure('no_chapters_in_toc', ['--no-chapters-in-toc'], default=False, help=_('Don\'t add detected chapters to the Table of Contents')) diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index 174e568909..3a3fb445d7 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -87,7 +87,7 @@ class HTMLProcessor(PreProcessor, LoggingInterface): def rewrite_links(self, olink): ''' Make all links in document relative so that they work in the EPUB container. - Also copies any resources (like image, stylesheets, scripts, etc.) into + Also copies any resources (like images, stylesheets, scripts, etc.) into the local tree. 
''' if not isinstance(olink, unicode): @@ -103,7 +103,7 @@ class HTMLProcessor(PreProcessor, LoggingInterface): name, ext = os.path.splitext(name) name += ('_%d'%len(self.resource_map)) + ext shutil.copyfile(link.path, os.path.join(self.resource_dir, name)) - name = 'resources/'+name + name = 'resources/' + name self.resource_map[link.path] = name return name diff --git a/src/calibre/manual/custom.py b/src/calibre/manual/custom.py index 2098369fee..2797829d87 100644 --- a/src/calibre/manual/custom.py +++ b/src/calibre/manual/custom.py @@ -95,7 +95,7 @@ $desc #end #for opt in options ${option(opt)} - ${opt.help.replace('\n', ' ').replace('%default', str(opt.default)) if opt.help else ''} + ${opt.help.replace('\n', ' ').replace('*', '\\*').replace('%default', str(opt.default)) if opt.help else ''} || #end #end diff --git a/src/calibre/manual/glossary.rst b/src/calibre/manual/glossary.rst index 4a366ae491..8ae69f5ff9 100644 --- a/src/calibre/manual/glossary.rst +++ b/src/calibre/manual/glossary.rst @@ -27,4 +27,4 @@ Glossary **URL** *(Uniform Resource Locator)* for example: ``http://example.com`` regexp - **Regular expressions** provide a concise and flexible means for identifying strings of text of interest, such as particular characters, words, or patterns of characters. See http://docs.python.org/lib/re-syntax.html for the syntax of regular expressions used in python. + **Regular expressions** provide a concise and flexible means for identifying strings of text of interest, such as particular characters, words, or patterns of characters. See `regexp syntax <http://docs.python.org/lib/re-syntax.html>`_ for the syntax of regular expressions used in python. 
diff --git a/src/calibre/manual/index.rst b/src/calibre/manual/index.rst index 1a11820d9c..cfda23762a 100644 --- a/src/calibre/manual/index.rst +++ b/src/calibre/manual/index.rst @@ -29,6 +29,7 @@ Sections conversion metadata faq + xpath glossary Convenience diff --git a/src/calibre/manual/xpath.rst b/src/calibre/manual/xpath.rst new file mode 100644 index 0000000000..9db3783dec --- /dev/null +++ b/src/calibre/manual/xpath.rst @@ -0,0 +1,108 @@ +.. include:: global.rst + +.. _xpath-tutorial: + +XPath Tutorial +============== + +In this tutorial, you will be given a gentle introduction to +`XPath `_, a query language that can be +used to select arbitrary parts of `HTML `_ +documents in |app|. XPath is a widely +used standard, and googling it will yield a ton of information. This tutorial, +however, focuses on using XPath for ebook related tasks like finding chapter +headings in an unstructured HTML document. + +.. contents:: Contents + :depth: 1 + :local: + +Selecting by tagname +---------------------------------------- + +The simplest form of selection is to select tags by name. For example, +suppose you want to select all the ``

<h2>`` tags in a document. The XPath +query for this is simply:: + + //h2 (Selects all <h2> tags) + +The prefix `//` means *search at any level of the document*. Now suppose you +want to search for ``<span>`` tags that are inside ``<a>`` tags. That can be +achieved with:: + + //a/span (Selects <span> tags inside <a> tags) + +If you want to search for tags at a particular level in the document, change +the prefix:: + + /body/div/p (Selects

<p> tags that are children of <div> tags that are + children of the <body> tag) + +This will match only ``<p>A very short ebook to demonstrate the use of XPath.</p>`` +in the `Sample ebook`_ but not any of the other ``<p>`` tags. + +Now suppose you want to select both ``<h1>`` and ``<h2>
`` tags. To do that, +we need an XPath construct called a *predicate*. A :dfn:`predicate` is simply +a test that is used to select tags. Tests can be arbitrarily powerful and as +this tutorial progresses, you will see more powerful examples. A predicate +is created by enclosing the test expression in square brackets:: + +//*[name()='h1' or name()='h2'] + +There are several new features in this XPath expression. The first is the use +of the wildcard ``*``. It means *match any tag*. Now look at the test expression +``name()='h1' or name()='h2'``. :term:`name()` is an example of a *built-in function*. +It simply evaluates to the name of the tag. So by using it, we can select tags +whose names are either `h1` or `h2`. XPath has several useful built-in functions. +A few more will be introduced in this tutorial. + +Selecting by attributes +----------------------- + +To select tags based on their attributes, the use of predicates is required:: + + //*[@style] (Select all tags that have a style attribute) + //*[@class="chapter"] (Select all tags that have class="chapter") + //h1[@class="bookTitle"] (Select all h1 tags that have class="bookTitle") + +Here, the ``@`` operator refers to the attributes of the tag. You can use some +of the `XPath built-in functions`_ to perform more sophisticated +matching on attribute values. + + +Selecting by tag content +------------------------ + +Using XPath, you can even select tags based on the text they contain. The best way to do this is +to use the power of *regular expressions* via the built-in function :term:`re:test()`:: + + //h2[re:test(., 'chapter|section', 'i')] (Selects <h2>

tags that contain the words chapter or + section) + +Here the ``.`` operator refers to the contents of the tag, just as the ``@`` operator referred +to its attributes. + + +Sample ebook +------------ + +.. literalinclude:: xpath.xhtml + :language: html + +XPath built-in functions +------------------------ + +.. glossary:: + + name() + The name of the current tag. + + contains() + ``contains(s1, s2)`` returns `true` if s1 contains s2. + + re:test() + ``re:test(src, pattern, flags)`` returns `true` if the string `src` matches the + regular expression `pattern`. A particularly useful flag is ``i``, which makes matching + case insensitive. A good primer on the syntax for regular expressions can be found + at `regexp syntax <http://docs.python.org/lib/re-syntax.html>`_ + diff --git a/src/calibre/manual/xpath.xhtml b/src/calibre/manual/xpath.xhtml new file mode 100644 index 0000000000..7468e3d856 --- /dev/null +++ b/src/calibre/manual/xpath.xhtml @@ -0,0 +1,19 @@ + + + A very short ebook + + + +

A very short ebook

+

Written by Kovid Goyal

+
+

A very short ebook to demonstrate the use of XPath.

+
+ +

Chapter One

+

This is a truly fascinating chapter.

+ +

Chapter Two

+

A worthy continuation of a fine tradition.

+ +