mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KG 0.7.44
This commit is contained in:
commit
9105f0e3f7
118
Changelog.yaml
118
Changelog.yaml
@ -19,6 +19,124 @@
|
|||||||
# new recipes:
|
# new recipes:
|
||||||
# - title:
|
# - title:
|
||||||
|
|
||||||
|
- version: 0.7.44
|
||||||
|
date: 2011-02-04
|
||||||
|
|
||||||
|
new features:
|
||||||
|
- title: "Nook Color driver: Send downloaded news to the My Files/Magazines folder on the Nook Color. Also when getting the list of books on the device look at all folders in My Files, not just My Files/Books."
|
||||||
|
|
||||||
|
- title: "MOBI Output: Use the book uuid as the ASIN field and set cdetype to EBOK to allow Amazon furthest read tracking to work with calibre generated MOBI files."
|
||||||
|
tickets: [8721]
|
||||||
|
|
||||||
|
- title: "Comic input: Add an option to override the image size in the generated comic. Useful if you have a device whose screen size is not covered by one of the available output profiles."
|
||||||
|
tickets: [7837]
|
||||||
|
|
||||||
|
- title: "Add a restore database option to the Library maintenance menu in the GUI"
|
||||||
|
|
||||||
|
- title: "TXT Output: Allow output in the textile markup language"
|
||||||
|
|
||||||
|
- title: "PML Output: Create multi-level Table of Contents"
|
||||||
|
|
||||||
|
- title: "Driver for the Archos 7O"
|
||||||
|
|
||||||
|
- title: "Search and Replace in the Bulk metadata dialog can now operate on the title_sort field as well"
|
||||||
|
tickets: [8732]
|
||||||
|
|
||||||
|
- title: "Allow changing the case of authors/tags/series etc. via the edit metadata dialog"
|
||||||
|
|
||||||
|
- title: "Connect/share menu: Re-organize to make it a little less easy to select email and delete instead of just email by mistake"
|
||||||
|
|
||||||
|
- title: "Heuristics: Improved Scene break detection and add option to control what scene breaks are replaced by."
|
||||||
|
|
||||||
|
- title: "SONY driver: Add option to not preserve aspect ratio of cover thumbnails."
|
||||||
|
|
||||||
|
- title: "BiBTeX catalog: Add on device column when available"
|
||||||
|
|
||||||
|
- title: "Add search to the plugin preferences dialog"
|
||||||
|
|
||||||
|
bug fixes:
|
||||||
|
- title: "Fix a bug that could cause files to be lost when changing metadata on east asian windows installs if the title and/or author is very long."
|
||||||
|
tickets: [8620]
|
||||||
|
|
||||||
|
- title: "Tag browser: Fix searching with items in a user category not working if the main category is hidden"
|
||||||
|
tickets: [8741]
|
||||||
|
|
||||||
|
- title: "Make completion for author/series/tags/etc. fields less disruptive"
|
||||||
|
|
||||||
|
- title: "Fix regression that broke the content server when user categories/custom columns are present"
|
||||||
|
|
||||||
|
- title: "Catalog generation: Handle user supplied templates more robustly"
|
||||||
|
|
||||||
|
- title: "Move the Tags to apply to newly added books option into Preferences->Adding books"
|
||||||
|
tickets: [8730]
|
||||||
|
|
||||||
|
- title: "Workaround for bug in Qt on OS X that caused crashes when reading metadata from two or more EPUB files with HTML covers that used embedded fonts. Now the embedded fonts are ignored on OS X."
|
||||||
|
tickets: [8643]
|
||||||
|
|
||||||
|
- title: "Fix regression that broke the use of the group searched terms tweak"
|
||||||
|
tickets: [8739]
|
||||||
|
|
||||||
|
- title: "Fix template program regression triggered by recursively calling the processor"
|
||||||
|
|
||||||
|
- title: "Fix mimetype sent by content server for PDB files"
|
||||||
|
|
||||||
|
- title: "OPF: Write title_sort as a calibre custom metadata field rather than as a file-as attribute on the title. This conforms to the OPF spec"
|
||||||
|
tickets: [7883]
|
||||||
|
|
||||||
|
- title: "SONY driver: Fix thumbnails sent to the SD card being written to the wrong location. Also use correct thumbnail size so that the SONY does not regenerate the thumbnail on disconnect"
|
||||||
|
|
||||||
|
- title: "Do not discard the result of a conversion if the user opens the edit metadata dialog while the conversion is running"
|
||||||
|
tickets: [8672]
|
||||||
|
|
||||||
|
- title: "CHM Input: When the chm file lacks a hhc, look for index.html instead"
|
||||||
|
tickets: [8688]
|
||||||
|
|
||||||
|
- title: "EPUB Input: Filter some invalid media types from the spine"
|
||||||
|
|
||||||
|
- title: "RTF Input: More encoding handling fixes."
|
||||||
|
tickets: [8678]
|
||||||
|
|
||||||
|
- title: "Linux binary build: Restore functioning of CALIBRE_DEVELOP_FROM, which was accidentally removed a few versions ago"
|
||||||
|
|
||||||
|
- title: "RTF Output: Retain html headings as rtf headings when converting to rtf. Also fix output of italics."
|
||||||
|
tickets: [8641, 8640]
|
||||||
|
|
||||||
|
- title: "LIT Input: Fix regression that broke handling of LIT files that contain txt data instead of html"
|
||||||
|
|
||||||
|
- title: "MOBI Input: Handle more non printing ASCII codes"
|
||||||
|
tickets: [8646]
|
||||||
|
|
||||||
|
- title: "Handle empty cover files more gracefully"
|
||||||
|
tickets: [8656]
|
||||||
|
|
||||||
|
- title: "Catalog generation: Fix error when Pocketbook is connected and trying to generate catalog"
|
||||||
|
tickets: [8651]
|
||||||
|
|
||||||
|
- title: "Heuristics: Italicize common cases, reduce false positives."
|
||||||
|
|
||||||
|
- title: "Fix regression that caused reporting of device connection errors to break"
|
||||||
|
|
||||||
|
improved recipes:
|
||||||
|
- MSN Japan
|
||||||
|
- The Onion
|
||||||
|
- La Tribuna de
|
||||||
|
- Wall Street Journal
|
||||||
|
- "20 Minutos"
|
||||||
|
- LA Times
|
||||||
|
- Endgadget Japan
|
||||||
|
- Ledevoir
|
||||||
|
- Vijesti
|
||||||
|
|
||||||
|
new recipes:
|
||||||
|
- title: "Cinco Dias and BBC Mundo"
|
||||||
|
author: Luis Hernandez
|
||||||
|
|
||||||
|
- title: "Explosm"
|
||||||
|
author: Andromeda Rabbit
|
||||||
|
|
||||||
|
- title: "Cinco Dias"
|
||||||
|
author: Luis Hernandez
|
||||||
|
|
||||||
|
|
||||||
- version: 0.7.43
|
- version: 0.7.43
|
||||||
date: 2011-01-28
|
date: 2011-01-28
|
||||||
|
54
format_docs/compression/palmdoc.txt
Normal file
54
format_docs/compression/palmdoc.txt
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
About
|
||||||
|
-----
|
||||||
|
|
||||||
|
PalmDOC uses LZ77 compression techniques. DOC files can contain only compressed
|
||||||
|
text. The format does not allow for any text formatting. This keeps files
|
||||||
|
small, in keeping with the Palm philosophy. However, extensions to the format
|
||||||
|
can use tags, such as HTML or PML, to include formatting within text. These
|
||||||
|
extensions to PalmDoc are not interchangeable and are the basis for most eBook
|
||||||
|
Reader formats on Palm devices.
|
||||||
|
|
||||||
|
LZ77 algorithms achieve compression by replacing portions of the data with
|
||||||
|
references to matching data that has already passed through both encoder and
|
||||||
|
decoder. A match is encoded by a pair of numbers called a length-distance pair,
|
||||||
|
which is equivalent to the statement "each of the next length characters is
|
||||||
|
equal to the character exactly distance characters behind it in the
|
||||||
|
uncompressed stream." (The "distance" is sometimes called the "offset" instead.)
|
||||||
|
|
||||||
|
In the PalmDoc format, a length-distance pair is always encoded by a two-byte
|
||||||
|
sequence. Of the 16 bits that make up these two bytes, 11 bits go to encoding
|
||||||
|
the distance, 3 go to encoding the length, and the remaining two are used to
|
||||||
|
make sure the decoder can identify the first byte as the beginning of such a
|
||||||
|
two-byte sequence.
|
||||||
|
|
||||||
|
PalmDoc combines LZ77 with a simple kind of byte pair compression.
|
||||||
|
|
||||||
|
|
||||||
|
PalmDoc files are decoded as follows:
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
Read a byte from the compressed stream. If the byte is
|
||||||
|
|
||||||
|
0x00: "1 literal" copy that byte unmodified to the decompressed stream.
|
||||||
|
|
||||||
|
0x09 to 0x7f: "1 literal" copy that byte unmodified to the decompressed stream.
|
||||||
|
|
||||||
|
0x01 to 0x08: "literals": the byte is interpreted as a count from 1 to 8, and
|
||||||
|
that many literals are copied unmodified from the compressed stream to the
|
||||||
|
decompressed stream.
|
||||||
|
|
||||||
|
0x80 to 0xbf: "length, distance" pair: the 2 leftmost bits of this byte ('10')
|
||||||
|
are discarded, and the following 6 bits are combined with the 8 bits of the
|
||||||
|
next byte to make a 14 bit "distance, length" item. Those 14 bits are broken
|
||||||
|
into 11 bits of distance backwards from the current location in the
|
||||||
|
uncompressed text, and 3 bits of length to copy from that point
|
||||||
|
(copying n+3 bytes, 3 to 10 bytes).
|
||||||
|
|
||||||
|
0xc0 to 0xff: "byte pair": this byte is decoded into 2 characters: a space
|
||||||
|
character, and a letter formed from this byte XORed with 0x80.
|
||||||
|
|
||||||
|
Repeat from the beginning until there are no more bytes in the compressed file.
|
||||||
|
|
||||||
|
PalmDOC data is always divided into 4096 byte blocks and the blocks are acted
|
||||||
|
upon independently.
|
||||||
|
|
3217
format_docs/compression/zip.txt
Normal file
3217
format_docs/compression/zip.txt
Normal file
File diff suppressed because it is too large
Load Diff
309
format_docs/pdb/ereader.txt
Normal file
309
format_docs/pdb/ereader.txt
Normal file
@ -0,0 +1,309 @@
|
|||||||
|
About
|
||||||
|
-----
|
||||||
|
|
||||||
|
The eReader format has evolved and changed over time. Consequently, there are
|
||||||
|
multiple versions of the eReader format. There are also two different tools
|
||||||
|
that can create eReader files. The official tools are Makebook and Dropbook.
|
||||||
|
Dropbook is the newer official tool that has replaced Makebook. However,
|
||||||
|
Makebook is still in wide use because it supports a wider range of platforms
|
||||||
|
than Dropbook. Dropbook is a GUI application that only runs on Windows and
|
||||||
|
Apple’s OS X.
|
||||||
|
|
||||||
|
|
||||||
|
PDB Identity
|
||||||
|
-------
|
||||||
|
|
||||||
|
PNRdPPrs
|
||||||
|
|
||||||
|
|
||||||
|
202 and 132 headers
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
|
Older files have a record 0 size of 202 and occasionally 116. Newer files have
|
||||||
|
a record 0 size of 132. As of this writing the 202 files only support text and
|
||||||
|
images. The image format in the 202 files is the same as the 132 files. The 132
|
||||||
|
files support a number of additional features.
|
||||||
|
|
||||||
|
|
||||||
|
Record 0, eReader header (202)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Note all values are in 2 byte increments. Like values are condensed into a
|
||||||
|
range. The range can be broken into 2 byte sections which represent the actual
|
||||||
|
stored values.
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
|
||||||
|
0-2 Version Non-DRM books 2 and 4.
|
||||||
|
2-8 Garbage
|
||||||
|
8-10 Non-Text Offset Start of Non text area (images) will run to the
|
||||||
|
end of the section list.
|
||||||
|
10-14 Unknown
|
||||||
|
14-24 Garbage
|
||||||
|
24-28 Unknown
|
||||||
|
28-98 Garbage
|
||||||
|
98-100 Unknown
|
||||||
|
100-110 Garbage
|
||||||
|
110-114 Unknown
|
||||||
|
114-116 Garbage
|
||||||
|
116-202 Unknown
|
||||||
|
|
||||||
|
* Garbage: Intentionally random values.
|
||||||
|
|
||||||
|
|
||||||
|
Text Records (202)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Text starts with section 1 and continues until the section indicated by the
|
||||||
|
Non-Text Offset. All text records are PalmDoc compressed.
|
||||||
|
|
||||||
|
Each character in the compressed data is xored with 0xA5.
|
||||||
|
|
||||||
|
A decompression example in pseudo Python:
|
||||||
|
|
||||||
|
for num in range(1, Non-Text Offset):
|
||||||
|
text += decompress_palmdoc(''.join([chr(ord(x) ^ 0xA5) for x in section_data(num)])).decode('cp1252', 'replace')
|
||||||
|
|
||||||
|
|
||||||
|
Dropbook 132 files
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The following sections apply to the newer Dropbook created files.
|
||||||
|
|
||||||
|
|
||||||
|
Record 0, eReader header (132)
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
This is only for 132 byte header files created by Dropbook.
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
|
||||||
|
0-2 compression Specifies compression and drm. 2 = palmdoc,
|
||||||
|
10 = zlib. 260 and 272 = DRM
|
||||||
|
2-6 unknown Value of 0 is used
|
||||||
|
6-8 encoding Always 25152 (0x6240). All text must be
|
||||||
|
encoded as Latin-1 cp1252
|
||||||
|
8-10 Number of small pages The number of small font pages. If page
|
||||||
|
index is not built in then 0.
|
||||||
|
10-12 Number of large pages The number of large font pages. If page
|
||||||
|
index is not built in then 0.
|
||||||
|
12-14 Non-Text record start The location of the first non text records.
|
||||||
|
record 1 to this value minus 1 are all text
|
||||||
|
records
|
||||||
|
14-16 Number of chapters The number of chapter index records
|
||||||
|
contained in the file
|
||||||
|
16-18 Number of small index The number of small font page index records
|
||||||
|
contained in the file
|
||||||
|
18-20 Number of large index The number of large font page index records
|
||||||
|
contained in the file
|
||||||
|
20-22 Number of images The number of images contained in the file
|
||||||
|
22-24 Number of links The number of links contained in the file
|
||||||
|
24-26 Metadata available Is there a metadata record in the file?
|
||||||
|
0 = None, 1 = There is a metadata record
|
||||||
|
26-28 Unknown Value of 0 is used
|
||||||
|
28-30 Number of Footnotes The number of footnote records in the file
|
||||||
|
30-32 Number of Sidebars The number of sidebar records in the file
|
||||||
|
32-34 Chapter index record start The location of chapter index records. If
|
||||||
|
there are no chapters use the value for the
|
||||||
|
Last data record.
|
||||||
|
34-36 2560 Magic value that must be set to 2560
|
||||||
|
36-38 Small page index start The location of small font page index
|
||||||
|
records. If page table is not built in use
|
||||||
|
the value for the Last data record.
|
||||||
|
38-40 Large page index start The location of large font page index
|
||||||
|
records. If page table is not built in use
|
||||||
|
the value for the Last data record.
|
||||||
|
40-42 Image data record start The location of the first image record. If
|
||||||
|
there are no images use the value for the
|
||||||
|
Last data record.
|
||||||
|
42-44 Links record start The location of the first link index
|
||||||
|
record. If there are no links use the value
|
||||||
|
for the Last data record.
|
||||||
|
44-46 Metadata record start The location of the metadata record. If
|
||||||
|
there is no metadata use the value for the
|
||||||
|
Last data record.
|
||||||
|
46-48 Unknown Value of 0 is used
|
||||||
|
48-50 Footnote record start The location of the first footnote record.
|
||||||
|
If there are no footnotes use the value for
|
||||||
|
the Last data record.
|
||||||
|
50-52 Sidebar record start The location of the first sidebar record.
|
||||||
|
If there are no sidebars use the value for
|
||||||
|
the Last data record.
|
||||||
|
52-54 Last data record The location of the last data record
|
||||||
|
54-132 Unknown Value of 0 is used
|
||||||
|
|
||||||
|
Note: All values are in 2 byte increments. All bytes in the table that have a
|
||||||
|
range larger than 2 can be broken into 2 byte segments and have different
|
||||||
|
values set for each grouping.
|
||||||
|
|
||||||
|
|
||||||
|
Records Order
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Though the order of these sections is described in the eReader header,
|
||||||
|
DropBook makes the following order:
|
||||||
|
|
||||||
|
1. eReader Header
|
||||||
|
2. Compressed text
|
||||||
|
3. Small font page index
|
||||||
|
4. Large font page index
|
||||||
|
5. Chapter index
|
||||||
|
6. Links index
|
||||||
|
7. Images
|
||||||
|
8. (Extrapolation: there should be one more record type here though it has
|
||||||
|
not yet been uncovered what it might be).
|
||||||
|
9. Metadata
|
||||||
|
10. Sidebar records
|
||||||
|
11. Footnote records
|
||||||
|
12. Text block size record
|
||||||
|
13. "MeTaInFo\x00" word record
|
||||||
|
|
||||||
|
|
||||||
|
Text Records
|
||||||
|
------------
|
||||||
|
|
||||||
|
All text records use cp1252 encoding (although eReader documents talk about
|
||||||
|
UTF-8 as well). Their total compressed size is unknown; however, anything below
|
||||||
|
3560 Bytes is known to work. The text will be either zlib or palmdoc
|
||||||
|
compressed. Use the compression value from the eReader header to determine
|
||||||
|
which. All text utilizes the Palm Markup Language (PML) for formatting.
|
||||||
|
|
||||||
|
Starting with DropBook 1.6.0 text is divided into 8KB (8192 bytes) blocks
|
||||||
|
trimming the end to the closest space character and then being compressed.
|
||||||
|
Earlier version of DropBook 1.5.2 tries to behave the same way, though
|
||||||
|
sometimes it trims the block in an unexpected place.
|
||||||
|
|
||||||
|
|
||||||
|
Chapter Index Records
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
Each chapter record corresponds to 1 chapter and points at the place in the
|
||||||
|
book. Chapter record takes a form of 'offset name\x00' First 4 bytes are offset
|
||||||
|
of the original pml file where the chapter index points to (offset of
|
||||||
|
the \x|\X?|\C? tags). Then without a space goes a name of a chapter in chapter
|
||||||
|
index. It should contain only text, all formatting tags should be removed.
|
||||||
|
\U and \a tags are not permitted in chapter name. To maintain sub-chapters
|
||||||
|
4*n spaces (\x20) are added to the beginning of the name, where "n" is level of
|
||||||
|
chapter: 0 for \x tag and N for \CN="" and \XN tags. And then an ending
|
||||||
|
\x00 symbol.
|
||||||
|
|
||||||
|
|
||||||
|
Image Records
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Image records must be smaller than 65505 Bytes. They must also be 8bit PNG
|
||||||
|
images.
|
||||||
|
|
||||||
|
An image record takes the form 'PNG name\x00... image_data'
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
|
||||||
|
0-4 PNG There must be a space after PNG.
|
||||||
|
4-36 image name. The image name must be exactly 32 Bytes long. Pad
|
||||||
|
the right side of the name with \x00 characters for
|
||||||
|
names shorter than 32 characters.
|
||||||
|
36-58 Unknown
|
||||||
|
58-60 width Width of an image
|
||||||
|
60-62 height Height of an image
|
||||||
|
62-? The image data raw image data in 8 bit PNG format
|
||||||
|
|
||||||
|
Note: DropBook seems to change something in png raw data. Like reencoding or
|
||||||
|
something, but plain insertion of png image there still works.
|
||||||
|
|
||||||
|
|
||||||
|
Links Records
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Links records are constructed the same way as chapter ones. Each link anchor
|
||||||
|
record corresponds to 1 link anchor and points at the place in the book. Link
|
||||||
|
record takes a form of 'offset name\x00' First 4 bytes are offset of the
|
||||||
|
original pml file where the link anchor points to (offset of the \Q tag). Then
|
||||||
|
without a space goes a name of a link anchor. It should contain only text, all
|
||||||
|
formatting tags should be removed. \U and \a tags are not permitted in link
|
||||||
|
anchor name. And then an ending \x00 symbol.
|
||||||
|
|
||||||
|
|
||||||
|
Footnote Records
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The first footnote record is a \x00 separated list of footnote ids. All
|
||||||
|
subsequent footnote records are the footnote text corresponding to the id's
|
||||||
|
position in the list. Footnote text is compressed in the same manner as normal
|
||||||
|
text records
|
||||||
|
|
||||||
|
E.G.
|
||||||
|
|
||||||
|
footnote section 1 = 'notice1\x00notice2\x00notice3\x00'
|
||||||
|
footnote section 2 = 'Text for notice 1'
|
||||||
|
footnote section 3 = 'Text for notice 2'
|
||||||
|
footnote section 4 = 'Text for notice 3'
|
||||||
|
|
||||||
|
Starting with Dropbook 1.5.2 the first record looks a bit different. It is a sequence
|
||||||
|
of \x00\x01 then 1 byte of footnote id length, then footnote id then \x00.
|
||||||
|
|
||||||
|
E.G.
|
||||||
|
|
||||||
|
footnote section 1 = '\x00\x01\x07notice1\x00\x00\x01\x0Afootnote10\x00'
|
||||||
|
|
||||||
|
|
||||||
|
Sidebar Records
|
||||||
|
---------------
|
||||||
|
|
||||||
|
The first sidebar record is a \x00 separated list of sidebar ids. All
|
||||||
|
subsequent sidebar records are the sidebar text corresponding to the id's
|
||||||
|
position in the list. Sidebar text is compressed in the same manner as normal
|
||||||
|
text records
|
||||||
|
|
||||||
|
E.G.
|
||||||
|
|
||||||
|
sidebar section 1 = 'notice1\x00notice2\x00notice3\x00'
|
||||||
|
sidebar section 2 = 'Text for notice 1'
|
||||||
|
sidebar section 3 = 'Text for notice 2'
|
||||||
|
sidebar section 4 = 'Text for notice 3'
|
||||||
|
|
||||||
|
Starting with Dropbook 1.5.2 the first record looks a bit different. It is a sequence
|
||||||
|
of \x00\x01 then 1 byte of sidebar's id length, then sidebar's id then \x00.
|
||||||
|
|
||||||
|
E.G.
|
||||||
|
|
||||||
|
sidebar section 1 = '\x00\x01\x07notice1\x00\x00\x01\x09sidebar10\x00'
|
||||||
|
|
||||||
|
|
||||||
|
Metadata Record
|
||||||
|
---------------
|
||||||
|
|
||||||
|
\x00 separated list of strings.
|
||||||
|
|
||||||
|
Metadata takes the form:
|
||||||
|
|
||||||
|
title\x00
|
||||||
|
author\x00
|
||||||
|
copyright\x00
|
||||||
|
publisher\x00
|
||||||
|
isbn\x00
|
||||||
|
|
||||||
|
E.G.
|
||||||
|
|
||||||
|
Gibraltar Earth\x00Michael McCollum\x001999\x00Sci Fi Arizona\x001929381255\x00
|
||||||
|
|
||||||
|
The metadata record is always followed by a record which contains 'MeTaInFo\x00'
|
||||||
|
|
||||||
|
Note: Starting with DropBook 1.5.2 'MeTaInFo\x00' is not following Metadata
|
||||||
|
Record. It is a separate record that ends the file and there are some more
|
||||||
|
records between Metadata record and 'MeTaInFo\x00' record.
|
||||||
|
|
||||||
|
|
||||||
|
Text Sizes Record
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
There is a special record that contains the initial size of all text blocks
|
||||||
|
before compression. It is just a sequence of 2-byte blocks which contain
|
||||||
|
the sizes.
|
||||||
|
|
||||||
|
E.G.
|
||||||
|
|
||||||
|
\x1F\xFB\x20\x00\x20\x00\x1F\xFE\x1F\xFD\x09\x46
|
||||||
|
|
||||||
|
Note: By this we can judge that the theoretical maximum initial block size is
|
||||||
|
65535 bytes.
|
||||||
|
|
414
format_docs/pdb/mbp.txt
Normal file
414
format_docs/pdb/mbp.txt
Normal file
@ -0,0 +1,414 @@
|
|||||||
|
// BEGINNING OF FILE
|
||||||
|
// NOTES:
|
||||||
|
// 1* Numeric data stored as big endian, 32 bits.
|
||||||
|
// 2* Data padded to 16 bits limits. (Sometimes to 32 bits limits?)
|
||||||
|
// 3* Text stored seems to be an 8 bit encoding padded to 16 bits
|
||||||
|
// (may be "ISO-8859-1"?, or may be just a local machine character set?)
|
||||||
|
// 4* I initially used the term "MARK" where I should have used "HIGHLIGHT",
|
||||||
|
// bear that in mind (it was a bad name choice when I started reversing)
|
||||||
|
|
||||||
|
<0x 31 bytes = book_title_PAR + 0x00 PAD if (book_title_PAR < 31) >
|
||||||
|
<0x 00>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
...4
|
||||||
|
...4
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
BPAR
|
||||||
|
MOBI
|
||||||
|
<0x 4 bytes = Next free pointer identifier>
|
||||||
|
// Note: pointer identifiers aren't always consecutive,
|
||||||
|
// so this number is usually bigger than the # of index entries
|
||||||
|
<0x 00 00>
|
||||||
|
<0x 4 bytes = Number of index entries>
|
||||||
|
<0x 4 bytes = Position of BPAR>
|
||||||
|
<0x 00 00 00 00> // BPAR pointer identifier = 0x0
|
||||||
|
|
||||||
|
|
||||||
|
// INDEXES:
|
||||||
|
// Order of Indexes: from the beginning of this MBP file,
|
||||||
|
// forward to the end of the file.
|
||||||
|
// Nevertheless, see these comments for order relative to:
|
||||||
|
// "BEGINING OF USER DATA": order of Data marks.
|
||||||
|
// "FINAL GROUP OF MARKS": order of final marks.
|
||||||
|
[for each {NOTE,MARK,CORRECTION,DRAWING,BOOKMARK,
|
||||||
|
AUTHOR,TITLE,CATEGORY,GENRE,ABSTRACT,COVER,PUBLISHER,
|
||||||
|
...}
|
||||||
|
|| "last DATA"]
|
||||||
|
// Note: Pointer identifiers to DATA's assigned so the number
|
||||||
|
// shrinks as the table grows down.
|
||||||
|
[if NOTE || CORRECTION]
|
||||||
|
<0x 4 bytes = Position of DATA....EBVS>
|
||||||
|
<0x 4 bytes = Pointer identifier, used by BKMK blocks>
|
||||||
|
[fi NOTE || CORRECTION]
|
||||||
|
<0x 4 bytes = Position of DATA>
|
||||||
|
<0x 4 bytes = Pointer identifier, used by BKMK blocks>
|
||||||
|
[if NOTE || CORRECTION]
|
||||||
|
<0x 4 bytes = Position of DATA>
|
||||||
|
<0x 4 bytes = Pointer identifier, used by BKMK blocks>
|
||||||
|
[fi NOTE || CORRECTION]
|
||||||
|
[if MARK || DRAWING || BOOKMARK]
|
||||||
|
<0x 4 bytes = Position of DATA....EBVS>
|
||||||
|
<0x 4 bytes = Pointer identifier, used by BKMK blocks>
|
||||||
|
[fi MARK || DRAWING || BOOKMARK]
|
||||||
|
[if AUTHOR || TITLE || CATEGORY || GENRE || ABSTRACT || COVER || PUBLISHER]
|
||||||
|
<0x 4 bytes = Position of [AUTH || TITL || CATE || GENR || ABST || COVE || PUBL] >
|
||||||
|
<0x 4 bytes = Pointer identifier>
|
||||||
|
[fi AUTHOR || TITLE || CATEGORY || GENRE || ABSTRACT || COVER || PUBLISHER]
|
||||||
|
[if last DATA] // there's always a last piece of DATA (not user data?)
|
||||||
|
<0x 4 bytes = Position of last DATA>
|
||||||
|
<0x 4 bytes = Pointer identifier> // usually <0x 00 00 00 01>
|
||||||
|
[fi last DATA]
|
||||||
|
[next {NOTE,MARK,CORRECTION,DRAWING,BOOKMARK,
|
||||||
|
AUTHOR,TITLE,CATEGORY,GENRE,ABSTRACT,COVER,PUBLISHER,
|
||||||
|
...}
|
||||||
|
|| "last DATA"]
|
||||||
|
|
||||||
|
|
||||||
|
[for each {NOTE,MARK,CORRECTION,DRAWING}]
|
||||||
|
<0x 4 bytes = Position of BKMK>
|
||||||
|
<0x 4 bytes = Pointer identifier>
|
||||||
|
// Note: pointer identifiers for BKMK's are usually the smallest
|
||||||
|
// of all the identifiers associated to an annotation. All
|
||||||
|
// other DATA references in INDEXES table associated to this
|
||||||
|
// BKMK, have bigger pointer identifiers.
|
||||||
|
// Note: Pointer identifiers to BKMK's assigned so the number
|
||||||
|
// grows as the table grows down.
|
||||||
|
[next {NOTE,MARK,CORRECTION,DRAWING}]
|
||||||
|
|
||||||
|
|
||||||
|
<0x 2 bytes random PAD>
|
||||||
|
BPAR
|
||||||
|
<0x 4 bytes = size of BPAR block>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
...4 <-- 'position of last read' related
|
||||||
|
...4 <-- 'position of last read' related
|
||||||
|
...4
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
...4
|
||||||
|
...4
|
||||||
|
...4 <-- 'position of last read' related
|
||||||
|
...(rest of size of BPAR block, if bigger than 0x20)
|
||||||
|
[if (size of BPAR block) mod 32 != 0]
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[fi]
|
||||||
|
|
||||||
|
// BEGINING OF USER DATA:
|
||||||
|
// Order of {NOTE,MARK,CORRECTION,DRAWING} :
|
||||||
|
// starts with user data at the end of the file,
|
||||||
|
// going backwards to the beginning of the file:
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[for each {NOTE,MARK,CORRECTION,DRAWING}]
|
||||||
|
//-------------------------------
|
||||||
|
[if NOTE]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of DATA block>
|
||||||
|
[if EBAR] // this block can appear, or not... ???
|
||||||
|
EBAR
|
||||||
|
...various {4 x byte} ???
|
||||||
|
[fi EBAR]
|
||||||
|
EBVS
|
||||||
|
<0x 00 00 00 03> ???
|
||||||
|
<0x 4 bytes = IDENTIFIER> ???
|
||||||
|
[<0x 00 00 00 01>, or nothing at all] ???
|
||||||
|
<0x 00 00 00 08>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 10>
|
||||||
|
...(rest of size of DATA block)
|
||||||
|
<0x FD EA = PAD? (ýê)>
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of <marked text (see 3rd note)> >
|
||||||
|
<marked text (see 3rd note)>
|
||||||
|
[if (size of <marked text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <marked text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of <note text (see 3rd note)> >
|
||||||
|
<note text (see 3rd note)>
|
||||||
|
[if (size of <note text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <note text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
[fi NOTE]
|
||||||
|
//-------------------------------
|
||||||
|
[if MARK || BOOKMARK]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of <marked text (see 3rd note)> >
|
||||||
|
<marked text (see 3rd note)>
|
||||||
|
[if (size of <marked text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <marked text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of DATA block>
|
||||||
|
[if EBAR] // this block can appear, or not... ???
|
||||||
|
EBAR
|
||||||
|
...various {4 x byte} ???
|
||||||
|
[fi EBAR]
|
||||||
|
EBVS
|
||||||
|
<0x 00 00 00 03> ???
|
||||||
|
<0x 4 bytes = IDENTIFIER> ???
|
||||||
|
[<0x 00 00 00 01>, or nothing at all] ???
|
||||||
|
<0x 00 00 00 08>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 10>
|
||||||
|
...(rest of size of DATA block)
|
||||||
|
<0x FD EA = PAD? (ýê)>
|
||||||
|
[fi MARK || BOOKMARK]
|
||||||
|
//-------------------------------
|
||||||
|
[if CORRECTION]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of DATA block>
|
||||||
|
[if EBAR] // this block can appear, or not... ???
|
||||||
|
EBAR
|
||||||
|
...various {4 x byte} ???
|
||||||
|
[fi EBAR]
|
||||||
|
EBVS
|
||||||
|
<0x 00 00 00 03> ???
|
||||||
|
<0x 4 bytes = IDENTIFIER> ???
|
||||||
|
[<0x 00 00 00 01>, or nothing at all] ???
|
||||||
|
<0x 00 00 00 08>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 10>
|
||||||
|
...(rest of size of DATA block)
|
||||||
|
<0x FD EA = PAD? (ýê)>
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of <marked text (see 3rd note)> >
|
||||||
|
<marked text (see 3rd note)>
|
||||||
|
[if (size of <marked text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <marked text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of <note text (see 3rd note)> >
|
||||||
|
<note text (see 3rd note)>
|
||||||
|
[if (size of <note text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <note text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
[fi CORRECTION]
|
||||||
|
//-------------------------------
|
||||||
|
[if DRAWING]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of raw data>
|
||||||
|
ADQM
|
||||||
|
// NOTE: background color is stored in corresponding BKMK.
|
||||||
|
[begin DRAWING format]
|
||||||
|
...4 = <0x 00 00 00 01> ???
|
||||||
|
<0x 4 bytes = X POSITION OF UPPER LEFT CORNER??? >
|
||||||
|
<0x 4 bytes = Y POSITION OF UPPER LEFT CORNER??? >
|
||||||
|
<0x 4 bytes = X SIZE in pixels >
|
||||||
|
<0x 4 bytes = Y SIZE in pixels >
|
||||||
|
...4 = <0x 00 00 00 00> ???
|
||||||
|
<0x 4 bytes = number of STROKES>
|
||||||
|
[if "number of STROKES" == 0]
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
[end DRAWING format]
|
||||||
|
[fi]
|
||||||
|
[for each STROKE]
|
||||||
|
<0x 00 00 00 01> ???
|
||||||
|
<0x 4 bytes> =
|
||||||
|
Stroke's beginning position in list of coordinates.
|
||||||
|
<0x 4 bytes> =
|
||||||
|
Stroke's ending position in list of coordinates.
|
||||||
|
<0x 00 RR GG BB> = RRGGBB color of stroke.
|
||||||
|
[next STROKE]
|
||||||
|
<0x 4 bytes> = number of coordinate pairs in array of coordinates.
|
||||||
|
// NOTE: each stroke is formed out of at least three
|
||||||
|
// coordinate pairs: begin, {next point}(1-n), end point.
|
||||||
|
[for each COORDINATE]
|
||||||
|
<0x 4 bytes> = X coordinate
|
||||||
|
<0x 4 bytes> = Y coordinate
|
||||||
|
[next COORDINATE]
|
||||||
|
[end DRAWING format]
|
||||||
|
[if (size of <marked text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <marked text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of <marked text (see 3rd note)> >
|
||||||
|
<marked text (see 3rd note)>
|
||||||
|
[if (size of <marked text (see 3rd note)>) mod 4 !=0]
|
||||||
|
<0x random PAD until (size of <marked text (see 3rd note)>) mod 4 ==0>
|
||||||
|
[fi]
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of DATA block>
|
||||||
|
[if EBAR] // this block can appear, or not... ???
|
||||||
|
EBAR
|
||||||
|
...various {4 x byte} ???
|
||||||
|
[fi EBAR]
|
||||||
|
EBVS
|
||||||
|
<0x 00 00 00 03>
|
||||||
|
<0x 4 bytes = IDENTIFIER>
|
||||||
|
[<0x 00 00 00 01>, or nothing at all] ???
|
||||||
|
<0x 00 00 00 08>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 10>
|
||||||
|
...(size of DATA block - 30)
|
||||||
|
<0x FD EA = PAD? (ýê)>
|
||||||
|
[fi DRAWING]
|
||||||
|
//-------------------------------
|
||||||
|
[next {NOTE,MARK,CORRECTION,DRAWING}]
|
||||||
|
|
||||||
|
// AUTHOR (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if AUTHOR]
|
||||||
|
AUTH
|
||||||
|
<0x 4 bytes = size of AUTHOR block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi AUTHOR]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// TITLE (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if TITLE]
|
||||||
|
TITL
|
||||||
|
<0x 4 bytes = size of TITLE block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi TITLE]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// GENRE (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if GENRE]
|
||||||
|
GENR
|
||||||
|
<0x 4 bytes = size of GENRE block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi GENRE]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// ABSTRACT (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if ABSTRACT]
|
||||||
|
ABST
|
||||||
|
<0x 4 bytes = size of ABSTRACT block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi ABSTRACT]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
|
||||||
|
// FINAL DATA
|
||||||
|
// Note: 'FINAL DATA' can occur anytime between these marks:
|
||||||
|
// AUTHOR,TITLE,CATEGORY,GENRE,ABSTRACT,COVER,PUBLISHER,...
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
DATA
|
||||||
|
<0x 4 bytes = size of EBVS block>
|
||||||
|
[if EBAR] // this block can appear, or not... ???
|
||||||
|
EBAR
|
||||||
|
...various {4 x byte} ???
|
||||||
|
[fi EBAR]
|
||||||
|
EBVS
|
||||||
|
<0x 00 00 00 03> || <0x 00 00 00 04>
|
||||||
|
<0x 4 bytes || 8 bytes = IDENTIFIER>
|
||||||
|
<0x 00 00 00 08>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
<0x 00 00 00 00>
|
||||||
|
<0x 00 00 00 10>
|
||||||
|
...(size of EBVS block - 30) :
|
||||||
|
...4 <-- 'position of last read' related
|
||||||
|
...various {4 x byte} ???
|
||||||
|
...4 <-- 'position of last read' related
|
||||||
|
...4
|
||||||
|
...4
|
||||||
|
...4
|
||||||
|
<0x FD EA = PAD? (ýê)>
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
|
||||||
|
// CATEGORY (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if CATEGORY]
|
||||||
|
CATE
|
||||||
|
<0x 4 bytes = size of CATEGORY block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi CATEGORY]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// COVER (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if COVER]
|
||||||
|
COVE
|
||||||
|
<0x 4 bytes = size of COVER block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi COVER]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// PUBLISHER (if any)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[if PUBLISHER]
|
||||||
|
PUBL
|
||||||
|
<0x 4 bytes = size of PUBLISHER block>
|
||||||
|
<text (see 3rd note)>
|
||||||
|
[fi PUBLISHER]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
// FINAL GROUP OF MARKS
|
||||||
|
// Order of {NOTE,MARK,CORRECTION} :
|
||||||
|
// starts with user data at the beginning of the file,
|
||||||
|
// going forwards to the end:
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
[for each {NOTE,MARK,CORRECTION,DRAWING,BOOKMARK}]
|
||||||
|
BKMK
|
||||||
|
<0x 4 bytes = size of BKMK>
|
||||||
|
<0x 4 bytes = TEXT position of the beginning of {NOTE,MARK,CORRECTION,DRAWING,BOOKMARK}>
|
||||||
|
//-------------------------------
|
||||||
|
[if DRAWING]
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[else]
|
||||||
|
<0x 4 bytes = TEXT position of the end of {NOTE,MARK,CORRECTION,BOOKMARK}>
|
||||||
|
[fi DRAWING]
|
||||||
|
...4
|
||||||
|
...4
|
||||||
|
//-------------------------------
|
||||||
|
[if NOTE]
|
||||||
|
<0x xx xx xx (20)?>, xxxxxx=>RRGGBB color ???
|
||||||
|
<0x 00 00 00 02>
|
||||||
|
[fi NOTE]
|
||||||
|
[if MARK]
|
||||||
|
<0x xx xx xx (0F/00)??>, xxxxxx=>RRGGBB color ???
|
||||||
|
<0x 00 00 00 04>
|
||||||
|
[fi MARK]
|
||||||
|
[if CORRECTION]
|
||||||
|
<0x xx xx xx (6F)?>, xxxxxx=>RRGGBB color ???
|
||||||
|
<0x 00 00 00 02>
|
||||||
|
[fi CORRECTION]
|
||||||
|
[if DRAWING]
|
||||||
|
<0x xx xx xx (0F)?>, xxxxxx=>RRGGBB DRAWING's background color.
|
||||||
|
<0x 00 00 00 08>
|
||||||
|
[fi DRAWING]
|
||||||
|
[if BOOKMARK]
|
||||||
|
<0x xx xx xx 00>
|
||||||
|
<0x 00 00 00 01>
|
||||||
|
[fi BOOKMARK]
|
||||||
|
// this one is a strange type of mark, of as yet unidentified use:
|
||||||
|
[if UNKNOWN_TYPE_YET_1]
|
||||||
|
<0x xx xx xx 00>
|
||||||
|
<0x 00 00 40 00>
|
||||||
|
[fi UNKNOWN_TYPE_YET_1]
|
||||||
|
|
||||||
|
//-------------------------------
|
||||||
|
[if BOOKMARK || (NOTE "without stored marked text")]
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[else]
|
||||||
|
<0x 4 bytes = DATA pointer in INDEXES>
|
||||||
|
[fi BOOKMARK]
|
||||||
|
[if DRAWING || MARK]
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[else]
|
||||||
|
<0x 4 bytes = DATA pointer in INDEXES>
|
||||||
|
[fi]
|
||||||
|
<0x 4 bytes = DATA pointer in INDEXES>
|
||||||
|
[if DRAWING]
|
||||||
|
<0x 4 bytes = DATA pointer in INDEXES>
|
||||||
|
[else]
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[fi]
|
||||||
|
//-------------------------------
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[next {NOTE,MARK,CORRECTION,DRAWING,BOOKMARK}]
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
|
||||||
|
[if length % 32 bit != 0] ???
|
||||||
|
<0x FF FF FF FF>
|
||||||
|
[fi]
|
||||||
|
|
||||||
|
// END OF FILE
|
||||||
|
|
||||||
|
// by idleloop@yahoo.com, v0.2.e, 12/2009
|
||||||
|
// http://www.angelfire.com/ego2/idleloop
|
341
format_docs/pdb/mobi.txt
Normal file
341
format_docs/pdb/mobi.txt
Normal file
@ -0,0 +1,341 @@
|
|||||||
|
from (http://wiki.mobileread.com/wiki/MOBI)
|
||||||
|
|
||||||
|
About
|
||||||
|
-----
|
||||||
|
|
||||||
|
MOBI is the format used by the MobiPocket Reader. It may have a .mobi
|
||||||
|
extension or it may have a .prc extension. The extension can be changed by the
|
||||||
|
user to either of the accepted forms. In either case it may be DRM protected or
|
||||||
|
non-DRM. The .prc extension is used because the PalmOS doesn't support any file
|
||||||
|
extensions except .prc or .pdb. Note that Mobipocket prohibits their DRM format
|
||||||
|
to be used on dedicated eBook readers that support other DRM formats.
|
||||||
|
|
||||||
|
|
||||||
|
Description
|
||||||
|
-----------
|
||||||
|
|
||||||
|
MOBI format was originally an extension of the PalmDOC format by adding
|
||||||
|
certain HTML like tags to the data. Many MOBI formatted documents still use
|
||||||
|
this form. However there is also a high compression version of this file format
|
||||||
|
that compresses data to a larger degree in a proprietary manner. There are some
|
||||||
|
third party programs that can read the eBooks in the original MOBI format but
|
||||||
|
there are only a few third party programs that can read the eBooks in the new
|
||||||
|
compressed form. The higher compression mode is using a huffman coding scheme
|
||||||
|
that has been called the Huff/cdic algorithm.
|
||||||
|
|
||||||
|
From time to time features have been added to the format so new files may have
|
||||||
|
problems if you try and read them with a down level reader. Currently the
|
||||||
|
source files follow the guidelines in the Open eBook format.
|
||||||
|
|
||||||
|
Note that AZW for the Amazon Kindle is the same format as MOBI except that it
|
||||||
|
uses a slightly different DRM scheme.
|
||||||
|
|
||||||
|
|
||||||
|
Format
|
||||||
|
------
|
||||||
|
|
||||||
|
Like PalmDOC, the Mobipocket file format is that of a standard Palm Database
|
||||||
|
Format file. The header of that format includes the name of the database
|
||||||
|
(usually the book title and sometimes a portion of the author's name) which is
|
||||||
|
up to 31 bytes of data. The files are identified as Creator ID of MOBI and a
|
||||||
|
Type of BOOK.
|
||||||
|
|
||||||
|
|
||||||
|
PalmDOC Header
|
||||||
|
--------------
|
||||||
|
|
||||||
|
The first record in the Palm Database Format gives more information about the
|
||||||
|
Mobipocket file. The first 16 bytes are almost identical to the first sixteen
|
||||||
|
bytes of a PalmDOC format file.
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
2 Compression 1 == no compression, 2 = PalmDOC compression,
|
||||||
|
17480 = HUFF/CDIC compression.
|
||||||
|
2 Unused Always zero
|
||||||
|
4 text length Uncompressed length of the entire text of the book
|
||||||
|
2 record count Number of PDB records used for the text of the book.
|
||||||
|
2 record size Maximum size of each record containing text, always
|
||||||
|
4096.
|
||||||
|
4 Current Position Current reading position, as an offset into the
|
||||||
|
uncompressed text
|
||||||
|
|
||||||
|
There are two differences from a Palm DOC file. There's an additional
|
||||||
|
compression type (17480), and the Current Position bytes are used for a
|
||||||
|
different purpose:
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
2 Encryption Type 0 == no encryption, 1 = Old Mobipocket Encryption,
|
||||||
|
2 = Mobipocket Encryption.
|
||||||
|
2 Unknown Usually zero
|
||||||
|
|
||||||
|
The old Mobipocket Encryption scheme only allows the file to be registered
|
||||||
|
with one PID, unlike the current encryption scheme that allows multiple PIDs to
|
||||||
|
be used in a single file. Unless specifically mentioned, all the encryption
|
||||||
|
information on this page refers to the current scheme.
|
||||||
|
|
||||||
|
|
||||||
|
MOBI Header
|
||||||
|
-----------
|
||||||
|
|
||||||
|
Most Mobipocket files also have a MOBI header in record 0 that follows these
|
||||||
|
16 bytes, and newer formats also have an EXTH header following the MOBI header,
|
||||||
|
again all in record 0 of the PDB file format.
|
||||||
|
|
||||||
|
The MOBI header is of variable length and is not documented. Some fields have
|
||||||
|
been tentatively identified as follows:
|
||||||
|
|
||||||
|
offset bytes content comments
|
||||||
|
16 4 identifier The characters M O B I
|
||||||
|
20 4 header length The length of the MOBI header, including
|
||||||
|
the previous 4 bytes
|
||||||
|
24 4 Mobi type The kind of Mobipocket file this is
|
||||||
|
2 Mobipocket Book
|
||||||
|
3 PalmDoc Book
|
||||||
|
4 Audio
|
||||||
|
257 News
|
||||||
|
258 News_Feed
|
||||||
|
259 News_Magazine
|
||||||
|
513 PICS
|
||||||
|
514 WORD
|
||||||
|
515 XLS
|
||||||
|
516 PPT
|
||||||
|
517 TEXT
|
||||||
|
518 HTML
|
||||||
|
28 4 text Encoding 1252 = CP1252 (WinLatin1); 65001 = UTF-8
|
||||||
|
32 4 Unique-ID Some kind of unique ID number (random?)
|
||||||
|
36 4 Generator version Potentially the version of the
|
||||||
|
Mobipocket-generation tool. Always >=
|
||||||
|
the value of the "format version" field
|
||||||
|
and <= the version of mobigen used to
|
||||||
|
produce the file.
|
||||||
|
40 40 Reserved All 0xFF. In case of a dictionary, or
|
||||||
|
some newer file formats, a few bytes are
|
||||||
|
used from this range of 40 0xFFs
|
||||||
|
80 4 First Non-book index? First record number (starting with 0)
|
||||||
|
that's not the book's text
|
||||||
|
84 4 Full Name Offset Offset in record 0 (not from start of
|
||||||
|
file) of the full name of the book
|
||||||
|
88 4 Full Name Length Length in bytes of the full name of the
|
||||||
|
book
|
||||||
|
92 4 Language Book language code. Low byte is main
|
||||||
|
language 09= English, next byte is
|
||||||
|
dialect, 08 = British, 04 = US
|
||||||
|
96 4 Input Language Input language for a dictionary
|
||||||
|
100 4 Output Language Output language for a dictionary
|
||||||
|
104 4 Format version Potentially the version of the
|
||||||
|
Mobipocket format used in this file.
|
||||||
|
Always >= 1 and <= the value of the
|
||||||
|
"generator version" field.
|
||||||
|
108 4 First Image record First record number (starting with 0)
|
||||||
|
that contains an image. Image records
|
||||||
|
should be sequential. If there are
|
||||||
|
no images this will be 0xffffffff.
|
||||||
|
112 4 HUFF record Record containing Huff information
|
||||||
|
used in HUFF/CDIC decompression.
|
||||||
|
116 4 HUFF count Number of Huff records.
|
||||||
|
120 4 DATP record Unknown: Record starts with DATP.
|
||||||
|
124 4 DATP count Number of DATP records.
|
||||||
|
128 4 EXTH flags Bitfield. if bit 6, 0x40 is set, then
|
||||||
|
there's an EXTH record
|
||||||
|
The following records are only present if the mobi header is long enough.
|
||||||
|
132 36 ? 36 unknown bytes, if MOBI is long enough
|
||||||
|
168 4 DRM Offset Offset to DRM key info in DRMed files.
|
||||||
|
0xFFFFFFFF if no DRM
|
||||||
|
172 4 DRM Count Number of entries in DRM info.
|
||||||
|
174 4 DRM Size Number of bytes in DRM info.
|
||||||
|
176 4 DRM Flags Some flags concerning the DRM info.
|
||||||
|
180 6 ?
|
||||||
|
186 2 Last Image record Possible value with the last image
|
||||||
|
record. If there are no images in the
|
||||||
|
book this will be 0xffff.
|
||||||
|
188 4 ?
|
||||||
|
192 4 FCIS record Unknown. Record starts with FCIS.
|
||||||
|
196 4 ?
|
||||||
|
200 4 FLIS record Unknown. Record starts with FLIS.
|
||||||
|
204 ? ? Bytes to the end of the MOBI header,
|
||||||
|
including the following if the header
|
||||||
|
length >= 228. ( 244 from start of
|
||||||
|
record)
|
||||||
|
242 2 Extra Data Flags A set of binary flags, some of which
|
||||||
|
indicate extra data at the end of each
|
||||||
|
text block. This only seems to be valid
|
||||||
|
for Mobipocket format version 5 and 6
|
||||||
|
(and higher?), when the header length
|
||||||
|
is 228 (0xE4) or 232 (0xE8).
|
||||||
|
|
||||||
|
|
||||||
|
EXTH Header
|
||||||
|
-----------
|
||||||
|
|
||||||
|
If the MOBI header indicates that there's an EXTH header, it follows immediately
|
||||||
|
after the MOBI header. Since the MOBI header is of variable length, this isn't
|
||||||
|
at any fixed offset in record 0. Note that some readers will ignore any EXTH
|
||||||
|
header info if the mobipocket version number specified in the MOBI header is 2
|
||||||
|
or less (perhaps 3 or less).
|
||||||
|
|
||||||
|
The EXTH header is also undocumented, so some of this is guesswork.
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
4 identifier the characters E X T H
|
||||||
|
4 header length the length of the EXTH header, including the previous 4 bytes
|
||||||
|
4 record Count The number of records in the EXTH header. The rest of the EXTH header consists of repeated EXTH records to the end of the EXTH length.
|
||||||
|
EXTH record start Repeat until done.
|
||||||
|
4 record type Exth Record type. Just a number identifying what's stored in the record
|
||||||
|
4 record length length of EXTH record = L , including the 8 bytes in the type and length fields
|
||||||
|
L-8 record data Data.
|
||||||
|
EXTH record end Repeat until done.
|
||||||
|
|
||||||
|
There are lots of different EXTH record types. Ones found so far in Mobipocket
|
||||||
|
files are listed here, with possible meanings. Hopefully the table will be
|
||||||
|
filled in as more information comes to light.
|
||||||
|
|
||||||
|
record type usual length name comments
|
||||||
|
1 drm_server_id
|
||||||
|
2 drm_commerce_id
|
||||||
|
3 drm_ebookbase_book_id
|
||||||
|
100 author
|
||||||
|
101 publisher
|
||||||
|
102 imprint
|
||||||
|
103 description
|
||||||
|
104 isbn
|
||||||
|
105 subject
|
||||||
|
106 publishingdate
|
||||||
|
107 review
|
||||||
|
108 contributor
|
||||||
|
109 rights
|
||||||
|
110 subjectcode
|
||||||
|
111 type
|
||||||
|
112 source
|
||||||
|
113 asin
|
||||||
|
114 versionnumber
|
||||||
|
115 sample
|
||||||
|
116 startreading
|
||||||
|
118 retail price (as text)
|
||||||
|
119 retail price currency (as text)
|
||||||
|
201 coveroffset
|
||||||
|
202 thumboffset
|
||||||
|
203 hasfakecover
|
||||||
|
204 204 Unknown
|
||||||
|
205 205 Unknown
|
||||||
|
206 206 Unknown
|
||||||
|
207 207 Unknown
|
||||||
|
208 208 Unknown
|
||||||
|
300 300 Unknown
|
||||||
|
401 clippinglimit
|
||||||
|
402 publisherlimit
|
||||||
|
403 403 Unknown
|
||||||
|
404 404 ttsflag
|
||||||
|
501 4 cdetype PDOC - Personal Doc;
|
||||||
|
EBOK - ebook;
|
||||||
|
502 lastupdatetime
|
||||||
|
503 updatedtitle
|
||||||
|
|
||||||
|
And now, at the end of Record 0 of the PDB file format, we usually get the full
|
||||||
|
file name, the offset of which is given in the MOBI header.
|
||||||
|
|
||||||
|
|
||||||
|
Variable-width integers
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
Some parts of the Mobipocket format encode data as variable-width integers.
|
||||||
|
These integers are represented big-endian with 7 bits per byte in bits 1-7. They
|
||||||
|
may be either forward-encoded, in which case only the LSB has bit 8 set, or
|
||||||
|
backward-encoded, in which case only the MSB has bit 8 set. For example, the
|
||||||
|
number 0x11111 would be represented forward-encoded as:
|
||||||
|
|
||||||
|
0x04 0x22 0x91
|
||||||
|
|
||||||
|
And backward-encoded as:
|
||||||
|
|
||||||
|
0x84 0x22 0x11
|
||||||
|
|
||||||
|
|
||||||
|
Trailing entries
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The Extra Data Flags field of the MOBI header indicates which, if any, trailing
|
||||||
|
entries are appended to the end of each text record. Each set bit in the field
|
||||||
|
indicates a trailing entry. The entries appear to occur in bit-order; e.g.,
|
||||||
|
trailing entry 1 immediately follows the text content and entry 16 occurs at
|
||||||
|
the very end of the record. The effect and exact details of most of these
|
||||||
|
entries are unknown. The trailing entries indicated by bits 2-16 appear to
|
||||||
|
follow a common format. That format is:
|
||||||
|
|
||||||
|
<data><size>
|
||||||
|
|
||||||
|
Where <size> is the size of the entire trailing entry (including the size of
|
||||||
|
<size>) as a backward-encoded Mobipocket variable-width integer.
|
||||||
|
|
||||||
|
Only a few bits have been identified
|
||||||
|
|
||||||
|
bit Data at end of records
|
||||||
|
0x0001 Multi-byte character overlaps
|
||||||
|
0x0002 Some data to help with indexing
|
||||||
|
0x0004 Some data about uncrossable breaks
|
||||||
|
|
||||||
|
|
||||||
|
Multibyte character overlap
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
When bit 1 of the Extra Data Flags field is set, each record is followed by a
|
||||||
|
trailing entry containing any extra bytes necessary to complete a multibyte
|
||||||
|
character which crosses the record boundary. The bytes do not participate in
|
||||||
|
compression regardless of which compression scheme is used for the file. However,
|
||||||
|
unlike the trailing data bytes, the multibytes (including the count byte) do
|
||||||
|
get included in any encryption. The overlapping bytes then re-appear as normal
|
||||||
|
content at the beginning of the following record. The trailing entry ends with
|
||||||
|
a byte containing a count of the overlapping bytes plus additional flags.
|
||||||
|
|
||||||
|
offset bytes content comments
|
||||||
|
0 0-3 N terminal bytes
|
||||||
|
of a multibyte
|
||||||
|
character
|
||||||
|
N 1 Size & flags bits 1-2 encode N, use of bits 3-8 is unknown
|
||||||
|
|
||||||
|
|
||||||
|
PalmDOC Compression
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
PalmDOC uses LZ77 compression techniques. DOC files can contain only compressed
|
||||||
|
text. The format does not allow for any text formatting. This keeps files small,
|
||||||
|
in keeping with the Palm philosophy. However, extensions to the format can use
|
||||||
|
tags, such as HTML or PML, to include formatting within text. These extensions
|
||||||
|
to PalmDoc are not interchangeable and are the basis for most eBook Reader
|
||||||
|
formats on Palm devices.
|
||||||
|
|
||||||
|
LZ77 algorithms achieve compression by replacing portions of the data with
|
||||||
|
references to matching data that has already passed through both encoder and
|
||||||
|
decoder. A match is encoded by a pair of numbers called a length-distance pair,
|
||||||
|
which is equivalent to the statement "each of the next length characters is
|
||||||
|
equal to the character exactly distance characters behind it in the uncompressed
|
||||||
|
stream." (The "distance" is sometimes called the "offset" instead.)
|
||||||
|
|
||||||
|
In the PalmDoc format, a length-distance pair is always encoded by a two-byte
|
||||||
|
sequence. Of the 16 bits that make up these two bytes, 11 bits go to encoding
|
||||||
|
the distance, 3 go to encoding the length, and the remaining two are used to
|
||||||
|
make sure the decoder can identify the first byte as the beginning of such a
|
||||||
|
two-byte sequence. The exact algorithm needed to decode the compressed text can
|
||||||
|
be found on the PalmDOC page.
|
||||||
|
|
||||||
|
PalmDOC data is always divided into 4096 byte blocks and the blocks are acted
|
||||||
|
upon independently.
|
||||||
|
|
||||||
|
PalmDOC does have support for bookmarks. These pointers are named and refer to
|
||||||
|
an offset location in a file. If the file is edited these locations may no
|
||||||
|
longer refer to the correct locations. Some reading programs allow the user to
|
||||||
|
enter or edit these bookmarks while others treat them as a TOC. Some reading
|
||||||
|
programs may ignore them entirely. They are stored at the end of the file itself
|
||||||
|
so the full file needs to be scanned when loaded to find them.
|
||||||
|
|
||||||
|
|
||||||
|
MBP
|
||||||
|
---
|
||||||
|
|
||||||
|
This is the extension used on a side file (auxiliary) for MOBI formatted eBooks.
|
||||||
|
It is used to store metadata used by the library software and also to store
|
||||||
|
user entered data like bookmarks, annotations, last read position. This file is
|
||||||
|
created automatically by the reader program when the eBook is first opened and
|
||||||
|
has a .mbp extension. The Library management software in MobiPocket uses this
|
||||||
|
file to get information displayed in the library window such as title and author
|
||||||
|
so that it won't have to open the larger eBook file.
|
||||||
|
|
25
format_docs/pdb/palmdoc.txt
Normal file
25
format_docs/pdb/palmdoc.txt
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
PalmDoc Format
|
||||||
|
--------------
|
||||||
|
|
||||||
|
The format is that of a standard Palm Database Format file. The header of that
|
||||||
|
format includes the name of the database (usually the book title and sometimes
|
||||||
|
a portion of the author's name) which is up to 31 bytes of data. This string of
|
||||||
|
characters is terminated with a 0 in the C style. The files are identified as
|
||||||
|
Creator ID of REAd and a Type of TEXt.
|
||||||
|
|
||||||
|
|
||||||
|
Record 0
|
||||||
|
--------
|
||||||
|
|
||||||
|
The first record in the Palm Database Format gives more information about the
|
||||||
|
PalmDOC file, and contains 16 bytes.
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
|
||||||
|
2 Compression 1 == no compression, 2 = PalmDOC compression (see below)
|
||||||
|
2 Unused Always zero
|
||||||
|
4 text length Uncompressed length of the entire text of the book
|
||||||
|
2 record count Number of PDB records used for the text of the book.
|
||||||
|
2 record size Maximum size of each record containing text, always 4096
|
||||||
|
4 Current Position Current reading position, as an offset into the uncompressed text
|
||||||
|
|
104
format_docs/pdb/pdb_format.txt
Normal file
104
format_docs/pdb/pdb_format.txt
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
Format
|
||||||
|
------
|
||||||
|
|
||||||
|
A PDB file can be broken into multiple parts. The header, record 0 and data.
|
||||||
|
Values stored within the various parts are big-endian byte order. The data
|
||||||
|
part is broken down into multiple sections. The section count and offsets
|
||||||
|
are referenced in the PDB header. Sections can be no more than 65505 bytes in
|
||||||
|
length.
|
||||||
|
|
||||||
|
|
||||||
|
Layout
|
||||||
|
------
|
||||||
|
|
||||||
|
PDB files take the format: DB header followed by the record 0 which has
|
||||||
|
contained format specific information followed by data.
|
||||||
|
|
||||||
|
DB Header
|
||||||
|
0 Record 0
|
||||||
|
.
|
||||||
|
. Data (broken down into sections)
|
||||||
|
.
|
||||||
|
|
||||||
|
|
||||||
|
Palm Database Header Format
|
||||||
|
|
||||||
|
bytes content comments
|
||||||
|
|
||||||
|
32 name database name. This name is 0 terminated in the
|
||||||
|
field and will be used as the file name on a
|
||||||
|
computer. For eBooks this usually contains the
|
||||||
|
title and may have the author depending on the
|
||||||
|
length available.
|
||||||
|
|
||||||
|
2 attributes bit field.
|
||||||
|
0x0002 Read-Only
|
||||||
|
0x0004 Dirty AppInfoArea
|
||||||
|
0x0008 Backup this database (i.e. no conduit exists)
|
||||||
|
0x0010 (16 decimal) Okay to install newer over
|
||||||
|
existing copy, if present on PalmPilot
|
||||||
|
0x0020 (32 decimal) Force the PalmPilot to reset
|
||||||
|
after this database is installed
|
||||||
|
0x0040 (64 decimal) Don't allow copy of file to be
|
||||||
|
beamed to other Pilot.
|
||||||
|
|
||||||
|
2 version file version
|
||||||
|
|
||||||
|
4 creation date No. of seconds since start of January 1, 1904.
|
||||||
|
|
||||||
|
4 modification date No. of seconds since start of January 1, 1904.
|
||||||
|
|
||||||
|
4 last backup date No. of seconds since start of January 1, 1904.
|
||||||
|
|
||||||
|
4 modificationNumber
|
||||||
|
|
||||||
|
4 appInfoID offset to start of Application Info (if present)
|
||||||
|
or null
|
||||||
|
|
||||||
|
4 sortInfoID offset to start of Sort Info (if present) or null
|
||||||
|
|
||||||
|
4 type See above table. (For Applications this data will
|
||||||
|
be 'appl')
|
||||||
|
|
||||||
|
4 creator See above table. This program will be launched if
|
||||||
|
the file is tapped
|
||||||
|
|
||||||
|
4 uniqueIDseed used internally to identify record
|
||||||
|
|
||||||
|
4 nextRecordListID Only used when in-memory on Palm OS. Always set to
|
||||||
|
zero in stored files.
|
||||||
|
|
||||||
|
2 number of Records number of records in the file - N
|
||||||
|
|
||||||
|
8N record Info List
|
||||||
|
|
||||||
|
start of record
|
||||||
|
info entry Repeat N times to end of record info entry
|
||||||
|
|
||||||
|
4 record Data Offset the offset from the start of the PDB of this record
|
||||||
|
|
||||||
|
1 record Attributes bit field. The least significant four bits are used
|
||||||
|
to represent the category values. These are the
|
||||||
|
categories used to split the databases for viewing
|
||||||
|
on the screen. A few of the 16 categories are
|
||||||
|
pre-defined but the user can add their own. There
|
||||||
|
is an undefined category for use if the user or
|
||||||
|
programmer hasn't set this.
|
||||||
|
0x10 (16 decimal) Secret record bit.
|
||||||
|
0x20 (32 decimal) Record in use (busy bit).
|
||||||
|
0x40 (64 decimal) Dirty record bit.
|
||||||
|
0x80 (128, unsigned decimal) Delete record on
|
||||||
|
next HotSync.
|
||||||
|
|
||||||
|
3 UniqueID The unique ID for this record. Often just a
|
||||||
|
sequential count from 0
|
||||||
|
|
||||||
|
end of record
|
||||||
|
info entry
|
||||||
|
|
||||||
|
2? Gap to data traditionally 2 zero bytes to Info or raw data
|
||||||
|
|
||||||
|
? Records The actual data in the file. AppInfoArea (if
|
||||||
|
present), SortInfoArea (if present) and then
|
||||||
|
records sequentially
|
||||||
|
|
34
format_docs/pdb/pdb_types.txt
Normal file
34
format_docs/pdb/pdb_types.txt
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
Palm Database File Code
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
Reader Type Code
|
||||||
|
|
||||||
|
Adobe Reader .pdfADBE
|
||||||
|
PalmDOC TEXtREAd
|
||||||
|
BDicty BVokBDIC
|
||||||
|
DB (Database program) DB99DBOS
|
||||||
|
eReader PNRdPPrs
|
||||||
|
eReader DataPPrs
|
||||||
|
FireViewer (ImageViewer) vIMGView
|
||||||
|
HanDBase PmDBPmDB
|
||||||
|
InfoView InfoINDB
|
||||||
|
iSilo ToGoToGo
|
||||||
|
iSilo 3 SDocSilX
|
||||||
|
JFile JbDbJBas
|
||||||
|
JFile Pro JfDbJFil
|
||||||
|
LIST DATALSdb
|
||||||
|
MobileDB Mdb1Mdb1
|
||||||
|
MobiPocket BOOKMOBI
|
||||||
|
Plucker DataPlkr
|
||||||
|
QuickSheet DataSprd
|
||||||
|
SuperMemo SM01SMem
|
||||||
|
TealDoc TEXtTlDc
|
||||||
|
TealInfo InfoTlIf
|
||||||
|
TealMeal DataTlMl
|
||||||
|
TealPaint DataTlPt
|
||||||
|
ThinkDB dataTDBP
|
||||||
|
Tides TdatTide
|
||||||
|
TomeRaider ToRaTRPW
|
||||||
|
Weasel zTXTGPlm
|
||||||
|
WordSmith BDOCWrdS
|
||||||
|
|
2122
format_docs/pdb/plucker.html
Normal file
2122
format_docs/pdb/plucker.html
Normal file
File diff suppressed because it is too large
Load Diff
936
format_docs/pdb/pml.txt
Normal file
936
format_docs/pdb/pml.txt
Normal file
@ -0,0 +1,936 @@
|
|||||||
|
Palm Markup Language
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
This page explains how to use the Palm Markup Language (PML) to specify
|
||||||
|
formatting and other information in a text file for later reading using the
|
||||||
|
eReader.
|
||||||
|
|
||||||
|
PML commands start with a backslash, "\", and usually consist of a single
|
||||||
|
character after that. Some PML commands are paired, such as those that specify
|
||||||
|
italicized text. Other commands are directives, such as the "\p", which
|
||||||
|
specifies a page break. PML is not meant to be an industrial-strength markup
|
||||||
|
language, but it is easy to understand, easy to parse, and creates high-quality
|
||||||
|
electronic books.
|
||||||
|
|
||||||
|
Since PML and Palm DropBook are not without flaws, there is a page of Tips and
|
||||||
|
Pitfalls.
|
||||||
|
|
||||||
|
|
||||||
|
Let's Dive Right In
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
palmsample.txt contains examples of formatting text, specifying chapters, etc.
|
||||||
|
Use it to start from, or just as an example when making your own books.
|
||||||
|
|
||||||
|
The following table specifies the Palm Markup Language commands, and what
|
||||||
|
they do.
|
||||||
|
|
||||||
|
\p New page
|
||||||
|
\x New chapter; also causes a new page break.
|
||||||
|
Enclose chapter title (and any style codes)
|
||||||
|
with \x and \x
|
||||||
|
\Xn New chapter, indented n levels (n between 0 and
|
||||||
|
4 inclusive) in the Chapter dialog; doesn't
|
||||||
|
cause a page break. Enclose chapter title (and
|
||||||
|
any style codes) with \Xn and \Xn
|
||||||
|
\Cn="Chapter title" Insert "Chapter title" into the chapter
|
||||||
|
listing, with level n (like \Xn). The text is
|
||||||
|
not shown on the page and does not force a page
|
||||||
|
break. This can sometimes be useful to insert a
|
||||||
|
chapter mark at the beginning of an
|
||||||
|
introduction to the chapter, for example.
|
||||||
|
\c Center this block of text; close with \c on
|
||||||
|
beginning of line
|
||||||
|
\r Right justify text block; close with \r on
|
||||||
|
beginning of line
|
||||||
|
\i Italicize block; close with \i
|
||||||
|
\u Underline block; close with \u
|
||||||
|
\o Overstrike block; close with \o
|
||||||
|
\v Invisible text; close with \v (can be used for
|
||||||
|
comments)
|
||||||
|
\t Indent block. Start at beginning of a line,
|
||||||
|
close with \t at end of a line
|
||||||
|
\T="50%" Indents the specified percentage of the screen
|
||||||
|
width, 50% in this case. If the current drawing
|
||||||
|
position is already past the specified screen
|
||||||
|
location, this tag is ignored.
|
||||||
|
\w="50%" Embed a horizontal rule of a given percentage
|
||||||
|
width of the screen, in this case 50%. This tag
|
||||||
|
causes a line break before and after it. The
|
||||||
|
rule is centered. The percent sign is mandatory.
|
||||||
|
\n Switch to the "normal" font, which is specified
|
||||||
|
by the user
|
||||||
|
\s Switch to stdFont; close with \s to revert to
|
||||||
|
normal font
|
||||||
|
\b Switch to boldFont; close with \b to revert to
|
||||||
|
normal font (deprecated; use \B instead)
|
||||||
|
\l Switch to largeFont; close with \l to revert to
|
||||||
|
normal font
|
||||||
|
\B Mark text as bold. Unlike the \b tag, \B
|
||||||
|
doesn't change the font, so you can have large
|
||||||
|
bold text. You cannot mix \b and \B in the same
|
||||||
|
PML file.
|
||||||
|
\Sp Mark text as superscript. Should not be mixed
|
||||||
|
with other styles such as bold, italic, etc.
|
||||||
|
Enclose superscripted text with \Sp.
|
||||||
|
\Sb Mark text as subscript. Should not be mixed
|
||||||
|
with other styles such as bold, italic, etc.
|
||||||
|
Enclose subscripted text with \Sb.
|
||||||
|
\k Make enclosed text into small-caps; close with
|
||||||
|
\k. Any characters enclosed in \k tags
|
||||||
|
(including those with accents) are made
|
||||||
|
uppercase and are rendered at a smaller point
|
||||||
|
size than a regular uppercase character.
|
||||||
|
\\ Represents a single backslash
|
||||||
|
\aXXX Insert non-ASCII character whose Windows 1252
|
||||||
|
code is decimal XXX. See the PML character
|
||||||
|
table for details.
|
||||||
|
\UXXXX Insert non-ASCII character whose Unicode code
|
||||||
|
is hexadecimal XXXX. See the Extended PML
|
||||||
|
character table for details.
|
||||||
|
\m="imagename.png" Insert the named image. See the section on
|
||||||
|
Images below.
|
||||||
|
\q="#linkanchor"Some text\q Reference a link anchor which is at another
|
||||||
|
spot in the document. The string after the
|
||||||
|
anchor specification and before the trailing \q
|
||||||
|
is underlined or otherwise shown to be a link
|
||||||
|
when viewing the document.
|
||||||
|
\Q="linkanchor" Specify a link anchor in the document.
|
||||||
|
\- Insert a soft hyphen. A soft hyphen shows up
|
||||||
|
only if it is necessary to break a word across
|
||||||
|
a line.
|
||||||
|
\Fn="footnote1"1\Fn Link the "1" to a footnote whose name is
|
||||||
|
footnote1, tagged at the end of the PML
|
||||||
|
document. See the section on Footnotes and
|
||||||
|
Sidebars below.
|
||||||
|
\Sd="sidebar1"Sidebar\Sd Link the "Sidebar" text to a sidebar whose name
|
||||||
|
is sidebar1, tagged at the end of the PML
|
||||||
|
document. See the section on Footnotes and
|
||||||
|
Sidebars below.
|
||||||
|
\I Mark as a reference index item. Enclose index
|
||||||
|
item (and any style codes) with \I and \I. See
|
||||||
|
Creating Dictionaries for more information.
|
||||||
|
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
\pThis is a new page
|
||||||
|
|
||||||
|
\xChapter III\x
|
||||||
|
|
||||||
|
\X1Chapter III, part A\X1
|
||||||
|
|
||||||
|
\p\C="Introduction"The following story is one of my favorites...
|
||||||
|
|
||||||
|
\cProperty of
|
||||||
|
Gateway Senior High School
|
||||||
|
\c
|
||||||
|
|
||||||
|
\rJustify my love
|
||||||
|
\r
|
||||||
|
|
||||||
|
This stuff is \ireally\i cool.
|
||||||
|
|
||||||
|
I just read \uMoby Dick.\u
|
||||||
|
|
||||||
|
This is a \obig\o mistake.
|
||||||
|
|
||||||
|
Copyright 1917\v Date of magazine serialization \v
|
||||||
|
|
||||||
|
\tOnce upon a time
|
||||||
|
there was a wicked queen
|
||||||
|
called Esmerelda.\t
|
||||||
|
|
||||||
|
Mammals:\T="40%"Lions
|
||||||
|
\T="40%"Tigers
|
||||||
|
\T="40%"Bears
|
||||||
|
|
||||||
|
He walked away.
|
||||||
|
\w="80%"
|
||||||
|
Later that day, he ran into an old friend.
|
||||||
|
|
||||||
|
\nIn the normal ways...
|
||||||
|
|
||||||
|
The \stitle page\s should be formatted...
|
||||||
|
|
||||||
|
I just \bcan't\b believe that you...
|
||||||
|
|
||||||
|
This \lREALLY\l is a large tiger...
|
||||||
|
|
||||||
|
This \Bbold\B text can be either \l\Blarge bold\B\l or \s\Bsmall bold\B\s.
|
||||||
|
|
||||||
|
e\Spx + 2\Sp = 9
|
||||||
|
|
||||||
|
C\Sb2\SbH\Sb3\SbO\Sb2\Sb should be used in moderation.
|
||||||
|
|
||||||
|
See also \kanteater\k.
|
||||||
|
|
||||||
|
The DOS prompt said "C:\\windows\\"
|
||||||
|
|
||||||
|
The man said \a147Yeah.\a148
|
||||||
|
|
||||||
|
Arrows can point \U2190 left or right \U2192.
|
||||||
|
|
||||||
|
A Yield sign looks like this: \m="yieldsign.png".
|
||||||
|
|
||||||
|
See the \q="#detailedinstructions"Detailed Instructions\q for how to install your eBook.
|
||||||
|
|
||||||
|
\Q="detailedinstructions"\bDetailed Instructions\b - This section
|
||||||
|
describes how to install an eBook to your handheld device.
|
||||||
|
|
||||||
|
Very long words like anti\-dis\-establish\-ment\-arian\-ism may benefit from
|
||||||
|
the use of soft hyphens.
|
||||||
|
|
||||||
|
The Emerson case\Fn="emerson"[1]\Fn will be very important...
|
||||||
|
|
||||||
|
For more information, see the \Sd="moreinfo"sidebar\Sd.
|
||||||
|
|
||||||
|
\I\Baardvark\B\I \in.\i a large burrowing nocturnal mammal that feeds especially on termites and ants
|
||||||
|
|
||||||
|
|
||||||
|
Footnotes and Sidebars
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Footnotes and Sidebars are specified with an XML-like syntax at the end of the
|
||||||
|
PML document. For example,
|
||||||
|
|
||||||
|
<sidebar id="sidebar1">
|
||||||
|
Here's some \itext\i for a sidebar.
|
||||||
|
</sidebar>
|
||||||
|
|
||||||
|
would specify the sidebar to be displayed when the user taps on a sidebar link
|
||||||
|
in the text that was specified using the \Sd tag.
|
||||||
|
|
||||||
|
Any text or PML placed after the first footnote or sidebar is ignored as part
|
||||||
|
of the book text.
|
||||||
|
|
||||||
|
Sidebars and footnotes can include most PML features, but there are some PML
|
||||||
|
tags that cannot be used inside of a sidebar or footnote.
|
||||||
|
|
||||||
|
These include
|
||||||
|
Chapters \x, \X, \C
|
||||||
|
Links \q, \Q
|
||||||
|
Footnotes \Fn
|
||||||
|
Sidebars \Sd
|
||||||
|
|
||||||
|
See the palmsample.txt file for examples of how to use many of the PML tags.
|
||||||
|
|
||||||
|
|
||||||
|
Images
|
||||||
|
------
|
||||||
|
|
||||||
|
The following rules are intended to guarantee that images in your eBook will be
|
||||||
|
viewable on all platforms that eReader runs on.
|
||||||
|
|
||||||
|
On low-resolution Palm OS handhelds, an image wider than 158 pixels or taller
|
||||||
|
than 148 pixels will be represented in the text by a thumbnail that the user
|
||||||
|
can tap to view the entire image. Images smaller than 158 x 148 will be
|
||||||
|
presented in-line with the text.
|
||||||
|
|
||||||
|
On high-resolution Palm OS handhelds (those having screens of 320x320 pixels or
|
||||||
|
more), images smaller than 158 by 148 pixels will be pixel-doubled. Images
|
||||||
|
larger than 158x148 may be shown in-line with the text, if they will fit on
|
||||||
|
the screen.
|
||||||
|
|
||||||
|
On non-Palm OS platforms, small images will be scaled up appropriately. Large
|
||||||
|
images will be scaled down to fit on the page; in this case the user can tap on
|
||||||
|
the image to view the entire image and zoom in or out.
|
||||||
|
|
||||||
|
For DropBook to find the image, it must be present in a directory whose name
|
||||||
|
matches that of the PML text file. For example, if "pmlsample.txt" contains a
|
||||||
|
reference to an image called "intro.png", then there must be a directory called
|
||||||
|
"pmlsample_img" that contains intro.png. The directory's name is the name of
|
||||||
|
the PML file (without the .txt extension) with "_img" appended.
|
||||||
|
|
||||||
|
Images must be in PNG format and cannot be filtered or interlaced. Image depth
|
||||||
|
must be 8 bits or less. Any color table may be used for color images.
|
||||||
|
|
||||||
|
Image files must be less than or equal to 65505 bytes in size, since they are
|
||||||
|
embedded into the .pdb format of the book; Palm database records are limited to
|
||||||
|
65505 bytes in length. Since images are compressed, the actual image displayed
|
||||||
|
by the reader may be much larger than 64K.
|
||||||
|
|
||||||
|
Any or all of these restrictions may eventually be removed.
|
||||||
|
|
||||||
|
|
||||||
|
Adding a Title, Cover Art, and Other Meta-information to Your eBook
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
|
||||||
|
DropBook normally presents a dialog in which the title and other information
|
||||||
|
for the eBook may be specified. This information may be embedded in the PML
|
||||||
|
file instead.
|
||||||
|
|
||||||
|
To specify the eBook title as it will appear in the Open dialog on the
|
||||||
|
handheld, place a block of invisible comment text at the beginning of the file
|
||||||
|
using \v tags. Inside this comment block, put the string TITLE="My eBook",
|
||||||
|
where "My eBook" is replaced with the name of your eBook. It should look
|
||||||
|
something like this:
|
||||||
|
|
||||||
|
\vTITLE="Palm Sample Document"\v
|
||||||
|
|
||||||
|
You can also specify the author using the AUTHOR meta-tag, the publisher with
|
||||||
|
PUBLISHER, copyright information with COPYRIGHT, and the eBook ISBN with EISBN.
|
||||||
|
A fully-specified set of meta-information might appear in PML as:
|
||||||
|
|
||||||
|
\vTITLE="Palm Sample Document" AUTHOR="Sam Morgenstern" PUBLISHER="eReader.com"
|
||||||
|
EISBN="X-XXXX-XXXX" COPYRIGHT="Copyright \a169 2004 by Sam Morgenstern"\v
|
||||||
|
|
||||||
|
Cover art: If an image named "cover.png" is present in the eBook, it is assumed
|
||||||
|
to be the cover art for the eBook. See the rules for images for sizing and
|
||||||
|
other information.
|
||||||
|
|
||||||
|
Some or all of this information may appear in the book information dialog in
|
||||||
|
eReader, and may be used for other purposes in future products.
|
||||||
|
|
||||||
|
|
||||||
|
Creating Dictionaries
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
The \I PML tag is used to delimit an index item. Example: \Iaardvark\I
|
||||||
|
|
||||||
|
Each entry must start in the normal font. If DropBook shows an error beginning
|
||||||
|
with "No styles permitted before...", there is probably a missing end style tag
|
||||||
|
before the text shown in the error message.
|
||||||
|
|
||||||
|
Links, chapters and other PML structures are not permitted in dictionaries.
|
||||||
|
Images, however, are.
|
||||||
|
|
||||||
|
A special dictionary entry, "(Front matter)" is shown before other entries in
|
||||||
|
the list of entries, and should be used to include pronunciation symbols and
|
||||||
|
other front matter.
|
||||||
|
|
||||||
|
Note that use of dictionaries requires eReader Pro.
|
||||||
|
|
||||||
|
|
||||||
|
Tips and Pitfalls
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
This page explains some common mistakes, some bugs in DropBook and/or the
|
||||||
|
eReader, and some techniques that will allow you to create quality electronic
|
||||||
|
books for the eReader.
|
||||||
|
|
||||||
|
* Check out the Converting to Palm eBooks page for some pointers on
|
||||||
|
converting text from various formats into the Palm Markup Language.
|
||||||
|
* Use a return at the end of each paragraph, not each line.
|
||||||
|
* Using an extra return between paragraphs reads easier than paragraph
|
||||||
|
indentation.
|
||||||
|
* The eReader doesn't display empty lines at the top of a page. If you need
|
||||||
|
to have some "empty" lines at the top of a page, put a space on each line.
|
||||||
|
* Don't use tables if you can possibly avoid it.
|
||||||
|
|
||||||
|
None of the fonts that the eReader supports are monospaced, so tables can
|
||||||
|
be difficult to represent. Break out the information in another way, or
|
||||||
|
use the \T tag, but beware of tables that look great on a Palm OS
|
||||||
|
handheld but not on a Pocket PC or vice versa.
|
||||||
|
|
||||||
|
* The Reader breaks lines on spaces, dashes or underscores. This has
|
||||||
|
several implications.
|
||||||
|
|
||||||
|
1. Don't fill more than a line with spaces, dashes or underscores.
|
||||||
|
There's a bug (which will be fixed in a future release) which
|
||||||
|
causes MakeBook to hang on such a line. Note that in the large
|
||||||
|
font, the number of spaces, dashes or underscores will be much
|
||||||
|
smaller than in the small font.
|
||||||
|
2. A string such as He shouted "Wait!--" may place the last quote on
|
||||||
|
the beginning of a line, since the line would break after the
|
||||||
|
second dash. Prevent this by using the PML string: He shouted
|
||||||
|
"Wait!\a150\a150". The non-breaking dash, code 150, will not break
|
||||||
|
a line. Use \a160 for a non-breaking space. Even better: use \a151,
|
||||||
|
a long dash, instead of two short dashes.
|
||||||
|
|
||||||
|
* The justification codes \c and \r (center and right justification) must
|
||||||
|
have closing codes on the beginning of the line following the justified
|
||||||
|
text.
|
||||||
|
* The indentation tag \t must have a closing tag at the end of a line of
|
||||||
|
the indented text.
|
||||||
|
* Use \s (small font) in the title page(s) of books to force the page(s) to
|
||||||
|
format nicely. Other than that, \n, \s and \l should rarely be necessary;
|
||||||
|
the font size used for most text display should be chosen by the user.
|
||||||
|
|
||||||
|
|
||||||
|
Converting Uncommon Characters to PML
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
Use this chart to convert uncommon characters to their Palm Markup Language
|
||||||
|
(PML) equivalent. Most characters are simply represented as themselves in PML
|
||||||
|
and don't require this chart. But some uncommon characters can only be
|
||||||
|
represented in PML by their "\aXXX" syntax. Use this chart to look up that
|
||||||
|
"\aXXX" syntax.
|
||||||
|
|
||||||
|
For example, if you wanted to write the following phrase in PML:
|
||||||
|
|
||||||
|
Copyright © 1999 by Samuel Morgenstern
|
||||||
|
|
||||||
|
In PML, you would write it as:
|
||||||
|
|
||||||
|
Copyright \a169 1999 by Samuel Morgenstern
|
||||||
|
|
||||||
|
Char HTML # Code HTML Char Code PML Char Code Description
|
||||||
|
|
||||||
|
  - Normal space
|
||||||
|
! ! - ! Exclamation
|
||||||
|
" " " " Double quote
|
||||||
|
# # - # Hash
|
||||||
|
$ $ - $ Dollar
|
||||||
|
% % - % Percent
|
||||||
|
& & & & Ampersand
|
||||||
|
' ' - ' Apostrophe
|
||||||
|
( ( - ( Open bracket
|
||||||
|
) ) - ) Close bracket
|
||||||
|
* * - * Asterisk
|
||||||
|
+ + - + Plus sign
|
||||||
|
, , - , Comma
|
||||||
|
- - - - Minus sign
|
||||||
|
. . - . Period
|
||||||
|
/ / - / Forward slash
|
||||||
|
0 0 - 0 Digit 0
|
||||||
|
1 1 - 1 Digit 1
|
||||||
|
2 2 - 2 Digit 2
|
||||||
|
3 3 - 3 Digit 3
|
||||||
|
4 4 - 4 Digit 4
|
||||||
|
5 5 - 5 Digit 5
|
||||||
|
6 6 - 6 Digit 6
|
||||||
|
7 7 - 7 Digit 7
|
||||||
|
8 8 - 8 Digit 8
|
||||||
|
9 9 - 9 Digit 9
|
||||||
|
: : - : Colon
|
||||||
|
; ; - ; Semicolon
|
||||||
|
< < < Less than
|
||||||
|
= = - = Equals
|
||||||
|
> > > Greater than
|
||||||
|
? ? - ? Question mark
|
||||||
|
@ @ - @ At sign
|
||||||
|
A A - A A
|
||||||
|
B B - B B
|
||||||
|
C C - C C
|
||||||
|
D D - D D
|
||||||
|
E E - E E
|
||||||
|
F F - F F
|
||||||
|
G G - G G
|
||||||
|
H H - H H
|
||||||
|
I I - I I
|
||||||
|
J J - J J
|
||||||
|
K K - K K
|
||||||
|
L L - L L
|
||||||
|
M M - M M
|
||||||
|
N N - N N
|
||||||
|
O O - O O
|
||||||
|
P P - P P
|
||||||
|
Q Q - Q Q
|
||||||
|
R R - R R
|
||||||
|
S S - S S
|
||||||
|
T T - T T
|
||||||
|
U U - U U
|
||||||
|
V V - V V
|
||||||
|
W W - W W
|
||||||
|
X X - X X
|
||||||
|
Y Y - Y Y
|
||||||
|
Z Z - Z Z
|
||||||
|
[ [ - [ Open square bracket
|
||||||
|
\ \ - \\ Backslash
|
||||||
|
] ] - ] Close square bracket
|
||||||
|
^ ^ - ^ Caret
|
||||||
|
_ _ - _ Underscore
|
||||||
|
` ` - ` Grave accent
|
||||||
|
a a - a a
|
||||||
|
b b - b b
|
||||||
|
c c - c c
|
||||||
|
d d - d d
|
||||||
|
e e - e e
|
||||||
|
f f - f f
|
||||||
|
g g - g g
|
||||||
|
h h - h h
|
||||||
|
i i - i i
|
||||||
|
j j - j j
|
||||||
|
k k - k k
|
||||||
|
l l - l l
|
||||||
|
m m - m m
|
||||||
|
n n - n n
|
||||||
|
o o - o o
|
||||||
|
p p - p p
|
||||||
|
q q - q q
|
||||||
|
r r - r r
|
||||||
|
s s - s s
|
||||||
|
t t - t t
|
||||||
|
u u - u u
|
||||||
|
v v - v v
|
||||||
|
w w - w w
|
||||||
|
x x - x x
|
||||||
|
y y - y y
|
||||||
|
z z - z z
|
||||||
|
{ { - { Left brace
|
||||||
|
| | - | Vertical bar
|
||||||
|
} } - } Right brace
|
||||||
|
~ ~ - ~ Tilde
|
||||||
|
|
||||||
|
  \a160 Non-breaking space
|
||||||
|
¡ ¡ \a161 Inverted exclamation
|
||||||
|
¢ ¢ \a162 Cent sign
|
||||||
|
£ £ \a163 Pound sign
|
||||||
|
¤ ¤ \a164 Currency sign
|
||||||
|
¥ ¥ \a165 Yen sign
|
||||||
|
¦ ¦ \a166 Broken bar
|
||||||
|
§ § \a167 Section sign
|
||||||
|
¨ ¨ \a168 Umlaut or diaeresis
|
||||||
|
© © \a169 Copyright sign
|
||||||
|
ª ª \a170 Feminine ordinal
|
||||||
|
« « \a171 Left angle quotes
|
||||||
|
¬ ¬ \a172 Logical not sign
|
||||||
|
­ ­ \a173 Soft hyphen
|
||||||
|
® ® \a174 Registered trademark
|
||||||
|
¯ ¯ \a175 Spacing macron
|
||||||
|
° ° \a176 Degree sign
|
||||||
|
± ± \a177 Plus-minus sign
|
||||||
|
² ² \a178 Superscript 2
|
||||||
|
³ ³ \a179 Superscript 3
|
||||||
|
´ ´ \a180 Spacing acute
|
||||||
|
µ µ \a181 Micro sign
|
||||||
|
¶ ¶ \a182 Paragraph sign
|
||||||
|
· · \a183 Middle dot
|
||||||
|
¸ ¸ \a184 Spacing cedilla
|
||||||
|
¹ ¹ \a185 Superscript 1
|
||||||
|
º º \a186 Masculine ordinal
|
||||||
|
» » \a187 Right angle quotes
|
||||||
|
¼ ¼ \a188 One quarter
|
||||||
|
½ ½ \a189 One half
|
||||||
|
¾ ¾ \a190 Three quarters
|
||||||
|
¿ ¿ \a191 Inverted question mark
|
||||||
|
À À \a192 A grave
|
||||||
|
Á Á \a193 A acute
|
||||||
|
  \a194 A circumflex
|
||||||
|
à à \a195 A tilde
|
||||||
|
Ä Ä \a196 A diaeresis
|
||||||
|
Å Å \a197 A ring
|
||||||
|
Æ &AElig; \a198 AE ligature
|
||||||
|
Ç Ç \a199 C cedilla
|
||||||
|
È È \a200 E grave
|
||||||
|
É É \a201 E acute
|
||||||
|
Ê Ê \a202 E circumflex
|
||||||
|
Ë Ë \a203 E diaeresis
|
||||||
|
Ì Ì \a204 I grave
|
||||||
|
Í Í \a205 I acute
|
||||||
|
Î Î \a206 I circumflex
|
||||||
|
Ï Ï \a207 I diaeresis
|
||||||
|
Ð Ð \a208 Eth
|
||||||
|
Ñ Ñ \a209 N tilde
|
||||||
|
Ò Ò \a210 O grave
|
||||||
|
Ó Ó \a211 O acute
|
||||||
|
Ô Ô \a212 O circumflex
|
||||||
|
Õ Õ \a213 O tilde
|
||||||
|
Ö Ö \a214 O diaeresis
|
||||||
|
× × \a215 Multiplication sign
|
||||||
|
Ø Ø \a216 O slash
|
||||||
|
Ù Ù \a217 U grave
|
||||||
|
Ú Ú \a218 U acute
|
||||||
|
Û Û \a219 U circumflex
|
||||||
|
Ü Ü \a220 U diaeresis
|
||||||
|
Ý Ý \a221 Y acute
|
||||||
|
Þ Þ \a222 THORN
|
||||||
|
ß ß \a223 sharp s
|
||||||
|
à à \a224 a grave
|
||||||
|
á á \a225 a acute
|
||||||
|
â â \a226 a circumflex
|
||||||
|
ã ã \a227 a tilde
|
||||||
|
ä ä \a228 a diaeresis
|
||||||
|
å å \a229 a ring
|
||||||
|
æ æ \a230 ae ligature
|
||||||
|
ç ç \a231 c cedilla
|
||||||
|
è è \a232 e grave
|
||||||
|
é é \a233 e acute
|
||||||
|
ê ê \a234 e circumflex
|
||||||
|
ë ë \a235 e diaeresis
|
||||||
|
ì ì \a236 i grave
|
||||||
|
í í \a237 i acute
|
||||||
|
î î \a238 i circumflex
|
||||||
|
ï ï \a239 i diaeresis
|
||||||
|
ð ð \a240 eth
|
||||||
|
ñ ñ \a241 n tilde
|
||||||
|
ò ò \a242 o grave
|
||||||
|
ó ó \a243 o acute
|
||||||
|
ô ô \a244 o circumflex
|
||||||
|
õ õ \a245 o tilde
|
||||||
|
ö ö \a246 o diaeresis
|
||||||
|
÷ ÷ \a247 division sign
|
||||||
|
ø ø \a248 o slash
|
||||||
|
ù ù \a249 u grave
|
||||||
|
ú ú \a250 u acute
|
||||||
|
û û \a251 u circumflex
|
||||||
|
ü ü \a252 u diaeresis
|
||||||
|
ý ý \a253 y acute
|
||||||
|
þ þ \a254 thorn
|
||||||
|
ÿ ÿ \a255 y diaeresis
|
||||||
|
, ‚ ‚ \a130 single low quote
|
||||||
|
ƒ ƒ \a131 Scripted f
|
||||||
|
„ „ \a132 low quote
|
||||||
|
… … \a133 Ellipsis
|
||||||
|
† † \a134 Dagger
|
||||||
|
‡ &Dagger; \a135 Double dagger
|
||||||
|
Š Š \a138 Large S w/inverted caret
|
||||||
|
< ‹ ‹ \a139 single left angle quote
|
||||||
|
Œ Œ \a140 Large combined oe
|
||||||
|
‘ ‘ \a145 Open single smart quote
|
||||||
|
’ ’ \a146 Close single smart quote
|
||||||
|
“ “ \a147 Open double smart quote
|
||||||
|
” ” \a148 Close double smart quote
|
||||||
|
• • \a149 Bullet
|
||||||
|
– – \a150 Small dash (en dash)
|
||||||
|
— — \a151 Large dash (em dash)
|
||||||
|
™ ™ \a153 Trademark
|
||||||
|
š š \a154 Small S w/inverted caret
|
||||||
|
> › › \a155 single right angle quote
|
||||||
|
œ œ \a156 Small combined oe
|
||||||
|
Ÿ Ÿ \a159 Large Y with diaeresis
|
||||||
|
|
||||||
|
|
||||||
|
Extended Character Set
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
In addition to the special characters supported by earlier versions of eReader
|
||||||
|
(which can be accessed using the \a### tag), all versions of eReader Pro and
|
||||||
|
eReader version 2.4 and later include support for additional special characters
|
||||||
|
and symbols. These symbols can be accessed using the \U#### tag, where #### are
|
||||||
|
four hexadecimal digits giving the Unicode encoding of the special character.
|
||||||
|
|
||||||
|
Only the limited subset of Unicode characters given in the table below is
|
||||||
|
supported. In addition, some of the characters that are included in the table
|
||||||
|
are not present in eReader Pro versions prior to 2.4. To ensure that the
|
||||||
|
characters are displayed correctly, books using these tags should be read using
|
||||||
|
eReader or eReader Pro version 2.4 or later.
|
||||||
|
|
||||||
|
On Palm OS handhelds these special symbols are only available in one size,
|
||||||
|
matching the "Small" font. For best results on Palm OS handhelds the \U tag
|
||||||
|
should only be used inside blocks set to the "Small" font by way of \s tags.
|
||||||
|
On Palm OS handhelds these special characters are not affected by the font tags
|
||||||
|
(\s, \l, \b and \n), the bold style tag (\B), or the small caps style tag (\k).
|
||||||
|
|
||||||
|
If the \U characters are not showing up correctly using eReader on your Windows
|
||||||
|
desktop or laptop this problem is a result of the fonts for eReader not being
|
||||||
|
installed properly. The solution is to go to the directory C:\Windows\Fonts\
|
||||||
|
and "double click" on each font that starts with "Maynard". This will open each
|
||||||
|
font and allow the system to register it. Close the windows that were opened as a
|
||||||
|
result of the mouse clicks and the problem should be resolved.
|
||||||
|
|
||||||
|
Char HTML Code PML Code Description
|
||||||
|
|
||||||
|
Latin Extended-A
|
||||||
|
Ā Ā \U0100 LATIN CAPITAL LETTER A WITH MACRON
|
||||||
|
ā ā \U0101 LATIN SMALL LETTER A WITH MACRON
|
||||||
|
Ă Ă \U0102 LATIN CAPITAL LETTER A WITH BREVE
|
||||||
|
ă ă \U0103 LATIN SMALL LETTER A WITH BREVE
|
||||||
|
ą ą \U0105 LATIN SMALL LETTER A WITH OGONEK
|
||||||
|
ć ć \U0107 LATIN SMALL LETTER C WITH ACUTE
|
||||||
|
Č Č \U010C LATIN CAPITAL LETTER C WITH CARON
|
||||||
|
č č \U010D LATIN SMALL LETTER C WITH CARON
|
||||||
|
Ē Ē \U0112 LATIN CAPITAL LETTER E WITH MACRON
|
||||||
|
ē ē \U0113 LATIN SMALL LETTER E WITH MACRON
|
||||||
|
ĕ ĕ \U0115 LATIN SMALL LETTER E WITH BREVE
|
||||||
|
ė ė \U0117 LATIN SMALL LETTER E WITH DOT ABOVE
|
||||||
|
ę ę \U0119 LATIN SMALL LETTER E WITH OGONEK
|
||||||
|
ě ě \U011B LATIN SMALL LETTER E WITH CARON
|
||||||
|
ĝ ĝ \U011D LATIN SMALL LETTER G WITH CIRCUMFLEX
|
||||||
|
ğ ğ \U011F LATIN SMALL LETTER G WITH BREVE
|
||||||
|
Ī Ī \U012A LATIN CAPITAL LETTER I WITH MACRON
|
||||||
|
ī ī \U012B LATIN SMALL LETTER I WITH MACRON
|
||||||
|
ĭ ĭ \U012D LATIN SMALL LETTER I WITH BREVE
|
||||||
|
į į \U012F LATIN SMALL LETTER I WITH OGONEK
|
||||||
|
ı ı \U0131 LATIN SMALL LETTER DOTLESS I
|
||||||
|
Ł Ł \U0141 LATIN CAPITAL LETTER L WITH STROKE
|
||||||
|
ł ł \U0142 LATIN SMALL LETTER L WITH STROKE
|
||||||
|
ń ń \U0144 LATIN SMALL LETTER N WITH ACUTE
|
||||||
|
ň ň \U0148 LATIN SMALL LETTER N WITH CARON
|
||||||
|
ŋ ŋ \U014B LATIN SMALL LETTER ENG
|
||||||
|
Ō Ō \U014C LATIN CAPITAL LETTER O WITH MACRON
|
||||||
|
ō ō \U014D LATIN SMALL LETTER O WITH MACRON
|
||||||
|
ŏ ŏ \U014F LATIN SMALL LETTER O WITH BREVE
|
||||||
|
ő ő \U0151 LATIN SMALL LETTER O WITH DOUBLE ACUTE
|
||||||
|
ŕ ŕ \U0155 LATIN SMALL LETTER R WITH ACUTE
|
||||||
|
ř ř \U0159 LATIN SMALL LETTER R WITH CARON
|
||||||
|
Ś Ś \U015A LATIN CAPITAL LETTER S WITH ACUTE
|
||||||
|
ś ś \U015B LATIN SMALL LETTER S WITH ACUTE
|
||||||
|
ş ş \U015F LATIN SMALL LETTER S WITH CEDILLA
|
||||||
|
ţ ţ \U0163 LATIN SMALL LETTER T WITH CEDILLA
|
||||||
|
ũ ũ \U0169 LATIN SMALL LETTER U WITH TILDE
|
||||||
|
ū ū \U016B LATIN SMALL LETTER U WITH MACRON
|
||||||
|
ŭ ŭ \U016D LATIN SMALL LETTER U WITH BREVE
|
||||||
|
ŷ ŷ \U0177 LATIN SMALL LETTER Y WITH CIRCUMFLEX
|
||||||
|
ź ź \U017A LATIN SMALL LETTER Z WITH ACUTE
|
||||||
|
Ž Ž \U017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
ž ž \U017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
Latin Extended-B
|
||||||
|
ƿ \U01BF LATIN LETTER WYNN
|
||||||
|
ǎ \U01CE LATIN SMALL LETTER A WITH CARON
|
||||||
|
ǐ \U01D0 LATIN SMALL LETTER I WITH CARON
|
||||||
|
ǒ \U01D2 LATIN SMALL LETTER O WITH CARON
|
||||||
|
ǔ \U01D4 LATIN SMALL LETTER U WITH CARON
|
||||||
|
ǡ \U01E1 LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON
|
||||||
|
ǣ \U01E3 LATIN SMALL LETTER AE WITH MACRON
|
||||||
|
ǧ \U01E7 LATIN SMALL LETTER G WITH CARON
|
||||||
|
ǫ \U01EB LATIN SMALL LETTER O WITH OGONEK
|
||||||
|
ǰ \U01F0 LATIN SMALL LETTER J WITH CARON
|
||||||
|
ȇ \U0207 LATIN SMALL LETTER E WITH INVERTED BREVE
|
||||||
|
ȝ \U021D LATIN SMALL LETTER YOGH
|
||||||
|
ȧ \U0227 LATIN SMALL LETTER A WITH DOT ABOVE
|
||||||
|
ȯ \U022F LATIN SMALL LETTER O WITH DOT ABOVE
|
||||||
|
ȳ \U0233 LATIN SMALL LETTER Y WITH MACRON
|
||||||
|
IPA Extensions
|
||||||
|
ɑ \U0251 LATIN SMALL LETTER SCRIPT A
|
||||||
|
ɒ \U0252 LATIN SMALL LETTER TURNED SCRIPT A
|
||||||
|
ɔ \U0254 LATIN SMALL LETTER OPEN O
|
||||||
|
ə \U0259 LATIN SMALL LETTER SCHWA
|
||||||
|
ɜ \U025C LATIN SMALL LETTER REVERSED OPEN E
|
||||||
|
ɥ \U0265 LATIN SMALL LETTER TURNED H
|
||||||
|
ɪ \U026A LATIN LETTER SMALL CAPITAL I
|
||||||
|
ɲ \U0272 LATIN SMALL LETTER N WITH LEFT HOOK
|
||||||
|
ʃ \U0283 LATIN SMALL LETTER ESH
|
||||||
|
ʉ \U0289 LATIN SMALL LETTER U BAR
|
||||||
|
ʊ \U028A LATIN SMALL LETTER UPSILON
|
||||||
|
ʌ \U028C LATIN SMALL LETTER TURNED V
|
||||||
|
ʏ \U028F LATIN LETTER SMALL CAPITAL Y
|
||||||
|
ʒ \U0292 LATIN SMALL LETTER EZH
|
||||||
|
ʔ \U0294 LATIN LETTER GLOTTAL STOP
|
||||||
|
ʜ \U029C LATIN LETTER SMALL CAPITAL H
|
||||||
|
Spacing Modifier Letters
|
||||||
|
ʾ \U02BE MODIFIER LETTER RIGHT HALF RING
|
||||||
|
ʿ \U02BF MODIFIER LETTER LEFT HALF RING
|
||||||
|
ˇ ˇ \U02C7 CARON
|
||||||
|
ˈ \U02C8 MODIFIER LETTER VERTICAL LINE
|
||||||
|
ˌ \U02CC MODIFIER LETTER LOW VERTICAL LINE
|
||||||
|
ː \U02D0 MODIFIER LETTER TRIANGULAR COLON
|
||||||
|
˘ ˘ \U02D8 BREVE
|
||||||
|
˙ ˙ \U02D9 DOT ABOVE
|
||||||
|
Greek and Coptic
|
||||||
|
Α Α \U0391 GREEK CAPITAL LETTER ALPHA
|
||||||
|
Β Β \U0392 GREEK CAPITAL LETTER BETA
|
||||||
|
Γ Γ \U0393 GREEK CAPITAL LETTER GAMMA
|
||||||
|
Δ Δ \U0394 GREEK CAPITAL LETTER DELTA
|
||||||
|
Ε Ε \U0395 GREEK CAPITAL LETTER EPSILON
|
||||||
|
Ζ Ζ \U0396 GREEK CAPITAL LETTER ZETA
|
||||||
|
Η Η \U0397 GREEK CAPITAL LETTER ETA
|
||||||
|
Θ Θ \U0398 GREEK CAPITAL LETTER THETA
|
||||||
|
Ι Ι \U0399 GREEK CAPITAL LETTER IOTA
|
||||||
|
Κ Κ \U039A GREEK CAPITAL LETTER KAPPA
|
||||||
|
Λ Λ \U039B GREEK CAPITAL LETTER LAMBDA
|
||||||
|
Μ Μ \U039C GREEK CAPITAL LETTER MU
|
||||||
|
Ν Ν \U039D GREEK CAPITAL LETTER NU
|
||||||
|
Ξ Ξ \U039E GREEK CAPITAL LETTER XI
|
||||||
|
Ο Ο \U039F GREEK CAPITAL LETTER OMICRON
|
||||||
|
Π Π \U03A0 GREEK CAPITAL LETTER PI
|
||||||
|
Ρ Ρ \U03A1 GREEK CAPITAL LETTER RHO
|
||||||
|
Σ Σ \U03A3 GREEK CAPITAL LETTER SIGMA
|
||||||
|
Τ Τ \U03A4 GREEK CAPITAL LETTER TAU
|
||||||
|
Υ Υ \U03A5 GREEK CAPITAL LETTER UPSILON
|
||||||
|
Φ Φ \U03A6 GREEK CAPITAL LETTER PHI
|
||||||
|
Χ Χ \U03A7 GREEK CAPITAL LETTER CHI
|
||||||
|
Ψ Ψ \U03A8 GREEK CAPITAL LETTER PSI
|
||||||
|
Ω Ω \U03A9 GREEK CAPITAL LETTER OMEGA
|
||||||
|
α α \U03B1 GREEK SMALL LETTER ALPHA
|
||||||
|
β β \U03B2 GREEK SMALL LETTER BETA
|
||||||
|
γ γ \U03B3 GREEK SMALL LETTER GAMMA
|
||||||
|
δ δ \U03B4 GREEK SMALL LETTER DELTA
|
||||||
|
ε ε \U03B5 GREEK SMALL LETTER EPSILON
|
||||||
|
ζ ζ \U03B6 GREEK SMALL LETTER ZETA
|
||||||
|
η η \U03B7 GREEK SMALL LETTER ETA
|
||||||
|
θ θ \U03B8 GREEK SMALL LETTER THETA
|
||||||
|
ι ι \U03B9 GREEK SMALL LETTER IOTA
|
||||||
|
κ κ \U03BA GREEK SMALL LETTER KAPPA
|
||||||
|
λ λ \U03BB GREEK SMALL LETTER LAMBDA
|
||||||
|
μ μ \U03BC GREEK SMALL LETTER MU
|
||||||
|
ν ν \U03BD GREEK SMALL LETTER NU
|
||||||
|
ξ ξ \U03BE GREEK SMALL LETTER XI
|
||||||
|
ο ο \U03BF GREEK SMALL LETTER OMICRON
|
||||||
|
π π \U03C0 GREEK SMALL LETTER PI
|
||||||
|
ρ ρ \U03C1 GREEK SMALL LETTER RHO
|
||||||
|
ς ς \U03C2 GREEK SMALL LETTER FINAL SIGMA
|
||||||
|
σ σ \U03C3 GREEK SMALL LETTER SIGMA
|
||||||
|
τ τ \U03C4 GREEK SMALL LETTER TAU
|
||||||
|
υ υ \U03C5 GREEK SMALL LETTER UPSILON
|
||||||
|
φ φ \U03C6 GREEK SMALL LETTER PHI
|
||||||
|
χ χ \U03C7 GREEK SMALL LETTER CHI
|
||||||
|
ψ ψ \U03C8 GREEK SMALL LETTER PSI
|
||||||
|
ω ω \U03C9 GREEK SMALL LETTER OMEGA
|
||||||
|
ϑ \U03D1 GREEK THETA SYMBOL
|
||||||
|
ϝ \U03DD GREEK SMALL LETTER DIGAMMA
|
||||||
|
Hebrew
|
||||||
|
א א \U05D0 HEBREW LETTER ALEPH
|
||||||
|
ב ב \U05D1 HEBREW LETTER BET
|
||||||
|
ג ג \U05D2 HEBREW LETTER GIMEL
|
||||||
|
ד ד \U05D3 HEBREW LETTER DALET
|
||||||
|
ה ה \U05D4 HEBREW LETTER HE
|
||||||
|
ו ו \U05D5 HEBREW LETTER VAV
|
||||||
|
ז ז \U05D6 HEBREW LETTER ZAYIN
|
||||||
|
ח ח \U05D7 HEBREW LETTER HET
|
||||||
|
ט ט \U05D8 HEBREW LETTER TET
|
||||||
|
י י \U05D9 HEBREW LETTER YOD
|
||||||
|
ך ך \U05DA HEBREW LETTER FINAL KAF
|
||||||
|
כ כ \U05DB HEBREW LETTER KAF
|
||||||
|
ל ל \U05DC HEBREW LETTER LAMED
|
||||||
|
ם ם \U05DD HEBREW LETTER FINAL MEM
|
||||||
|
מ מ \U05DE HEBREW LETTER MEM
|
||||||
|
ן ן \U05DF HEBREW LETTER FINAL NUN
|
||||||
|
נ נ \U05E0 HEBREW LETTER NUN
|
||||||
|
ס ס \U05E1 HEBREW LETTER SAMEKH
|
||||||
|
ע ע \U05E2 HEBREW LETTER AYIN
|
||||||
|
ף ף \U05E3 HEBREW LETTER FINAL PE
|
||||||
|
פ פ \U05E4 HEBREW LETTER PE
|
||||||
|
ץ ץ \U05E5 HEBREW LETTER FINAL TSADI
|
||||||
|
צ צ \U05E6 HEBREW LETTER TSADI
|
||||||
|
ק ק \U05E7 HEBREW LETTER QOF
|
||||||
|
ר ר \U05E8 HEBREW LETTER RESH
|
||||||
|
ת ת \U05EA HEBREW LETTER TAV
|
||||||
|
Latin Extended Additional
|
||||||
|
ḋ \U1E0B LATIN SMALL LETTER D WITH DOT ABOVE
|
||||||
|
ḍ \U1E0D LATIN SMALL LETTER D WITH DOT BELOW
|
||||||
|
ḗ \U1E17 LATIN SMALL LETTER E WITH MACRON AND ACUTE
|
||||||
|
Ḣ \U1E22 LATIN CAPITAL LETTER H WITH DOT ABOVE
|
||||||
|
Ḥ \U1E24 LATIN CAPITAL LETTER H WITH DOT BELOW
|
||||||
|
ḥ \U1E25 LATIN SMALL LETTER H WITH DOT BELOW
|
||||||
|
ḫ \U1E2B LATIN SMALL LETTER H WITH BREVE BELOW
|
||||||
|
ḳ \U1E33 LATIN SMALL LETTER K WITH DOT BELOW
|
||||||
|
ḷ \U1E37 LATIN SMALL LETTER L WITH DOT BELOW
|
||||||
|
ṁ \U1E41 LATIN SMALL LETTER M WITH DOT ABOVE
|
||||||
|
ṃ \U1E43 LATIN SMALL LETTER M WITH DOT BELOW
|
||||||
|
ṅ \U1E45 LATIN SMALL LETTER N WITH DOT ABOVE
|
||||||
|
ṇ \U1E47 LATIN SMALL LETTER N WITH DOT BELOW
|
||||||
|
ṓ \U1E53 LATIN SMALL LETTER O WITH MACRON AND ACUTE
|
||||||
|
ṙ \U1E59 LATIN SMALL LETTER R WITH DOT ABOVE
|
||||||
|
Ṛ \U1E5A LATIN CAPITAL LETTER R WITH DOT BELOW
|
||||||
|
ṛ \U1E5B LATIN SMALL LETTER R WITH DOT BELOW
|
||||||
|
ṡ \U1E61 LATIN SMALL LETTER S WITH DOT ABOVE
|
||||||
|
ṣ \U1E63 LATIN SMALL LETTER S WITH DOT BELOW
|
||||||
|
ṫ \U1E6B LATIN SMALL LETTER T WITH DOT ABOVE
|
||||||
|
ṭ \U1E6D LATIN SMALL LETTER T WITH DOT BELOW
|
||||||
|
ṯ \U1E6F LATIN SMALL LETTER T WITH LINE BELOW
|
||||||
|
ẑ \U1E91 LATIN SMALL LETTER Z WITH CIRCUMFLEX
|
||||||
|
ẓ \U1E93 LATIN SMALL LETTER Z WITH DOT BELOW
|
||||||
|
ẖ \U1E96 LATIN SMALL LETTER H WITH LINE BELOW
|
||||||
|
ạ \U1EA1 LATIN SMALL LETTER A WITH DOT BELOW
|
||||||
|
ọ \U1ECD LATIN SMALL LETTER O WITH DOT BELOW
|
||||||
|
ỹ \U1EF9 LATIN SMALL LETTER Y WITH TILDE
|
||||||
|
General Punctuation
|
||||||
|
- ‑ \U2011 NON-BREAKING HYPHEN
|
||||||
|
‸ \U2038 CARET
|
||||||
|
‽ \U203D INTERROBANG
|
||||||
|
⁂ \U2042 ASTERISM
|
||||||
|
Arrows
|
||||||
|
← ← \U2190 LEFTWARDS ARROW
|
||||||
|
→ → \U2192 RIGHTWARDS ARROW
|
||||||
|
Mathematical Operators
|
||||||
|
∂ ∂ \U2202 PARTIAL DIFFERENTIAL
|
||||||
|
√ √ \U221A SQUARE ROOT
|
||||||
|
∞ ∞ \U221E INFINITY
|
||||||
|
∥ ∥ \U2225 PARALLEL TO
|
||||||
|
∫ ∫ \U222B INTEGRAL
|
||||||
|
≠ ≠ \U2260 NOT EQUAL TO
|
||||||
|
⊔ \U2294 SQUARE CUP
|
||||||
|
⊕ \U2295 CIRCLED PLUS
|
||||||
|
⋮ \U22EE VERTICAL ELLIPSIS
|
||||||
|
Enclosed Alphanumerics
|
||||||
|
Ⓤ \U24CA CIRCLED LATIN CAPITAL LETTER U
|
||||||
|
Miscellaneous Symbols
|
||||||
|
☜ ☜ \U261C WHITE LEFT POINTING INDEX
|
||||||
|
☞ ☞ \U261E WHITE RIGHT POINTING INDEX
|
||||||
|
☿ \U263F MERCURY
|
||||||
|
♀ \U2640 FEMALE SIGN
|
||||||
|
♂ \U2642 MALE SIGN
|
||||||
|
♃ \U2643 JUPITER
|
||||||
|
♄ \U2644 SATURN
|
||||||
|
♅ \U2645 URANUS
|
||||||
|
♆ \U2646 NEPTUNE
|
||||||
|
♇ \U2647 PLUTO
|
||||||
|
♠ \U2660 BLACK SPADE SUIT
|
||||||
|
♡ \U2661 WHITE HEART SUIT
|
||||||
|
♢ \U2662 WHITE DIAMOND SUIT
|
||||||
|
♣ \U2663 BLACK CLUB SUIT
|
||||||
|
♭ \U266D MUSIC FLAT SIGN
|
||||||
|
♮ \U266E MUSIC NATURAL SIGN
|
||||||
|
♯ \U266F MUSIC SHARP SIGN
|
||||||
|
Dingbats
|
||||||
|
✓ \U2713 CHECK MARK
|
||||||
|
✠ \U2720 MALTESE CROSS
|
||||||
|
Private Use Area
|
||||||
|
- \UE000 LATIN SMALL LETTER A WITH MACRON AND ACUTE
|
||||||
|
- \UE001 LATIN SMALL LETTER A WITH MACRON AND TILDE
|
||||||
|
- \UE002 LATIN SMALL LETTER A WITH VERTICAL LINE ABOVE
|
||||||
|
- \UE003 LATIN CAPITAL LETTER C WITH MACRON
|
||||||
|
- \UE004 LATIN SMALL LETTER C WITH MACRON
|
||||||
|
- \UE005 LATIN SMALL LETTER C WITH BREVE
|
||||||
|
- \UE006 LATIN SMALL LETTER C WITH DOT BELOW
|
||||||
|
- \UE007 LATIN SMALL LIGATURE CH
|
||||||
|
- \UE008 LATIN CAPITAL LETTER D WITH MACRON
|
||||||
|
- \UE009 LATIN SMALL LETTER E WITH BAR BELOW
|
||||||
|
- \UE00A LATIN SMALL LETTER E WITH TILDE
|
||||||
|
- \UE00B LATIN SMALL LETTER E WITH MACRON AND BREVE
|
||||||
|
- \UE00C LATIN SMALL LETTER E WITH TILDE AND DOT ABOVE
|
||||||
|
- \UE00D LATIN SMALL LETTER E WITH HOOK RIGHT BELOW
|
||||||
|
- \UE00E LATIN SMALL LETTER G WITH INVERTED BREVE
|
||||||
|
- \UE00F LATIN SMALL LETTER I WITH INVERTED BREVE BELOW
|
||||||
|
- \UE010 LATIN SMALL LETTER I WITH MACRON AND ACUTE
|
||||||
|
- \UE011 LATIN SMALL LETTER K WITH CIRCUMFLEX
|
||||||
|
- \UE012 LATIN SMALL LETTER K WITH BREVE
|
||||||
|
- \UE013 LATIN SMALL LETTER K WITH INVERTED BREVE
|
||||||
|
- \UE014 LATIN SMALL LIGATURE KH
|
||||||
|
- \UE015 LATIN CAPITAL LETTER L WITH MACRON
|
||||||
|
- \UE016 LATIN SMALL LETTER L WITH TILDE
|
||||||
|
- \UE017 LATIN SMALL LETTER L WITH INVERTED BREVE
|
||||||
|
- \UE018 LATIN CAPITAL LETTER M WITH MACRON
|
||||||
|
- \UE019 LATIN SMALL LETTER M WITH MACRON
|
||||||
|
- \UE01A LATIN SMALL LETTER M WITH TILDE
|
||||||
|
- \UE01B LATIN SMALL LETTER O WITH CEDILLA
|
||||||
|
- \UE01C LATIN SMALL LETTER O WITH MACRON AND CIRCUMFLEX
|
||||||
|
- \UE01E LATIN SMALL LIGATURE OI
|
||||||
|
- \UE01F LATIN SMALL LIGATURE OO
|
||||||
|
- \UE020 LATIN SMALL LIGATURE OO WITH MACRON
|
||||||
|
- \UE021 LATIN SMALL LIGATURE OU
|
||||||
|
- \UE022 LATIN SMALL LETTER OPEN O WITH ACUTE
|
||||||
|
- \UE023 LATIN SMALL LETTER R WITH DIAERESIS
|
||||||
|
- \UE024 LATIN SMALL LETTER R WITH CIRCUMFLEX
|
||||||
|
- \UE025 LATIN SMALL LETTER R WITH RING BELOW
|
||||||
|
- \UE026 LATIN SMALL LETTER S WITH VERTICAL LINE ABOVE
|
||||||
|
- \UE027 LATIN SMALL LETTER S WITH OGONEK
|
||||||
|
- \UE028 LATIN SMALL LETTER S WITH COMMA
|
||||||
|
- \UE02A LATIN SMALL LETTER S WITH BREVE
|
||||||
|
- \UE02B LATIN SMALL LIGATURE SH
|
||||||
|
- \UE02C LATIN SMALL LIGATURE TH
|
||||||
|
- \UE02D LATIN SMALL LETTER U WITH MACRON AND ACUTE
|
||||||
|
- \UE02E LATIN CAPITAL LETTER V WITH MACRON
|
||||||
|
- \UE02F LATIN CAPITAL LETTER X WITH MACRON
|
||||||
|
- \UE030 LATIN SMALL LETTER X WITH CIRCUMFLEX
|
||||||
|
- \UE031 LATIN SMALL LETTER Y WITH BREVE
|
||||||
|
- \UE032 LATIN SMALL LIGATURE ZH
|
||||||
|
- \UE033 LATIN SMALL LETTER TURNED E WITH ACUTE
|
||||||
|
- \UE034 LATIN SMALL LETTER TURNED E WITH CIRCUMFLEX
|
||||||
|
- \UE035 GREEK SMALL LETTER ALPHA WITH GRAVE
|
||||||
|
- \UE036 MUSICAL SYMBOL SEGNO
|
||||||
|
- \UE037 MUSICAL SYMBOL FERMATA
|
||||||
|
- \UE038 MUSICAL SYMBOL CRESCENDO
|
||||||
|
- \UE039 MUSICAL SYMBOL DECRESCENDO
|
||||||
|
- \UE03A MUSICAL SYMBOL DOUBLE SHARP
|
||||||
|
- \UE03B MUSICAL SYMBOL BREVE
|
||||||
|
- \UE03C MUSICAL SYMBOL DOWN BOW
|
||||||
|
- \UE03D MUSICAL SYMBOL UP BOW
|
||||||
|
- \UE03E MUSICAL SYMBOL BREVE ALTERNATE
|
||||||
|
- \UE03F PRINTING SYMBOL DELE
|
||||||
|
- \UE040 PRINTING SYMBOL FRACTIONAL EM
|
||||||
|
- \UE041 INVERTED ASTERISM
|
||||||
|
- \UE042 LATIN SMALL LETTER SCHWA SUPERSCRIPT
|
||||||
|
- \UE043 LATIN SMALL LETTER TURNED Y
|
||||||
|
- \UE044 LATIN SMALL LIGATURE OE WITH MACRON
|
||||||
|
- \UE045 SQUARE ROOT WITH BAR
|
||||||
|
- \UE046 LATIN SMALL LETTER U WITH DOT ABOVE
|
||||||
|
- \UE047 LATIN SMALL LIGATURE UE
|
||||||
|
- \UE048 LATIN SMALL LIGATURE UE WITH MACRON
|
||||||
|
- \UE049 LATIN SMALL LETTER OPEN O WITH TILDE
|
||||||
|
- \UE04A LATIN SMALL LETTER T WITH CARON BELOW
|
||||||
|
- \UE04B LATIN SMALL LETTER SCRIPT A WITH TILDE
|
||||||
|
- \UE04C GREEK SMALL LETTER EPSILON WITH TILDE
|
||||||
|
- \UE04D LATIN SMALL LIGATURE OE WITH TILDE
|
||||||
|
- \UE04E MODIFIER LETTER DOUBLE VERTICAL LINE
|
||||||
|
- \UE04F DOUBLE HYPHEN
|
||||||
|
- \UE050 LATIN SMALL LETTER SCHWA WITH DOT ABOVE
|
||||||
|
- \UE051 LATIN SMALL LETTER SCHWA WITH MACRON
|
||||||
|
Alphabetic Presentation Forms
|
||||||
|
fl fl \UFB02 LATIN SMALL LIGATURE FL
|
||||||
|
שׁ שׁ \UFB2A HEBREW LETTER SHIN WITH SHIN DOT
|
||||||
|
שׂ שׂ \UFB2B HEBREW LETTER SHIN WITH SIN DOT
|
||||||
|
|
226
format_docs/pdb/ztxt.txt
Normal file
226
format_docs/pdb/ztxt.txt
Normal file
@ -0,0 +1,226 @@
|
|||||||
|
The zTXT Format
|
||||||
|
---------------
|
||||||
|
|
||||||
|
The zTXT format is relatively straightforward. The simplest zTXT contains a
|
||||||
|
Palm database header, followed by zTXT record #0, followed by the compressed
|
||||||
|
data. The compressed data can be in one of two formats: one long data stream,
|
||||||
|
or split into chunks for random access. If there are any bookmarks, they occupy
|
||||||
|
the record immediately after the compressed data. If there are any annotations,
|
||||||
|
the annotation index occupies the record immediately after the bookmarks with
|
||||||
|
each annotation in the index having a record immediately after the annotation
|
||||||
|
index. Here are diagrams of a simple zTXT and a full featured zTXT:
|
||||||
|
|
||||||
|
DB Header
|
||||||
|
0 Record 0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
... Compressed Data
|
||||||
|
36
|
||||||
|
37
|
||||||
|
38
|
||||||
|
|
||||||
|
DB Header
|
||||||
|
0 Record 0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
... Compressed Data
|
||||||
|
36
|
||||||
|
37
|
||||||
|
38
|
||||||
|
39 Bookmarks
|
||||||
|
40 Annotation Index
|
||||||
|
41 Annotation 1
|
||||||
|
42 Annotation 2
|
||||||
|
43 Annotation 3
|
||||||
|
|
||||||
|
|
||||||
|
Compression Modes
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
zTXT version 1.40 and later supports two modes of compression. Mode 1 is a
|
||||||
|
random access mode, and mode 2 consists of one long data stream. Both modes
|
||||||
|
work on 8K (the default record size) blocks of text.
|
||||||
|
|
||||||
|
Please note, however, that as of Weasel Reader version 1.60 the old style
|
||||||
|
(mode 2) zTXT format is no longer supported. makeztxt and libztxt still support
|
||||||
|
creating these documents for backwards compatibility, but you should not use
|
||||||
|
mode 2 if possible.
|
||||||
|
|
||||||
|
|
||||||
|
Mode 1
|
||||||
|
------
|
||||||
|
|
||||||
|
In mode one, 8K blocks of text are compressed into an equal number of blocks of
|
||||||
|
compressed data. Using the Z_FULL_FLUSH flush mode with zLib allows for random
|
||||||
|
access among the blocks of data. In order for this to function, the first block
|
||||||
|
must be decompressed first, and after that any block in the file may be
|
||||||
|
decompressed in any order. In mode 1, the blocks of compressed data will likely
|
||||||
|
not all have the same size.
|
||||||
|
|
||||||
|
|
||||||
|
Mode 2
|
||||||
|
------
|
||||||
|
|
||||||
|
In zTXT versions before 1.40, this was the only method of compression. This
|
||||||
|
mode involves compressing the entire input buffer into a single output buffer
|
||||||
|
and then splitting the resulting buffer into 8K segments. This mode requires
|
||||||
|
that all of the compressed data be decompressed in one pass. Since there are no
|
||||||
|
real 'blocks' of data, the resulting output can be of any blocksize, though
|
||||||
|
typically the default of 8K should be fine. The advantage to mode 2 is that it
|
||||||
|
will give about 10% - 15% more compression.
|
||||||
|
|
||||||
|
|
||||||
|
zTXT Record #0 Definition (version 1.44)
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
Record 0 provides all of the information about the zTXT contents. Be sure it is
|
||||||
|
correct, lest fiery death rain down upon your program.
|
||||||
|
|
||||||
|
typedef struct zTXT_record0Type {
|
||||||
|
UInt16 version;
|
||||||
|
UInt16 numRecords;
|
||||||
|
UInt32 size;
|
||||||
|
UInt16 recordSize;
|
||||||
|
UInt16 numBookmarks;
|
||||||
|
UInt16 bookmarkRecord;
|
||||||
|
UInt16 numAnnotations;
|
||||||
|
UInt16 annotationRecord;
|
||||||
|
UInt8 flags;
|
||||||
|
UInt8 reserved;
|
||||||
|
UInt32 crc32;
|
||||||
|
UInt8 padding[0x20 - 24];
|
||||||
|
} zTXT_record0;
|
||||||
|
|
||||||
|
|
||||||
|
Structure Elements
|
||||||
|
------------------
|
||||||
|
|
||||||
|
UInt16 version;
|
||||||
|
|
||||||
|
This is mostly just informational. Your program can figure out what features
|
||||||
|
might be available from the version. However, the remaining parts of the
|
||||||
|
structure are designed such that their value will be 0 if that particular
|
||||||
|
feature is not present, so that is the correct way to test. The version is
|
||||||
|
stored as two 8 bit integers. For example, version 1.42 is 0x012A.
|
||||||
|
|
||||||
|
UInt16 numRecords;
|
||||||
|
|
||||||
|
This is the number of DATA records only and does not include record 0,
|
||||||
|
bookmarks, or annotations. With compression mode 1, this is also the number of
|
||||||
|
uncompressed text records. With mode 2, you must decompress the file to figure
|
||||||
|
out how many text records there will be.
|
||||||
|
|
||||||
|
UInt32 size;
|
||||||
|
|
||||||
|
The size in bytes of the uncompressed data in the zTXT. Check this value with
|
||||||
|
the amount of free storage memory on the Palm to make sure there's enough room
|
||||||
|
to decompress the data in full or in part.
|
||||||
|
|
||||||
|
UInt16 recordSize;
|
||||||
|
|
||||||
|
recordSize is the size in bytes of a text record. This field is important, as
|
||||||
|
the size of text and decompression buffers is based on this value. It is used
|
||||||
|
by Weasel to navigate through the text so it can map absolute offsets to record
|
||||||
|
numbers. 8192 is the default. With compression mode 1, this is the amount of
|
||||||
|
data inside each compressed record (except maybe the last one), but the actual
|
||||||
|
compressed records will likely have varying sizes. In mode 2, both compressed
|
||||||
|
records and the resulting text records are all of this size (except, again, the
|
||||||
|
last record).
|
||||||
|
|
||||||
|
UInt16 numBookmarks;
|
||||||
|
|
||||||
|
The definitive count of how many bookmarks are stored in the bookmark index
|
||||||
|
record. See the section on bookmarks below.
|
||||||
|
|
||||||
|
UInt16 bookmarkRecord;
|
||||||
|
|
||||||
|
If there are any bookmarks, this is set to the record index number that
|
||||||
|
contains the bookmark listing, otherwise it is 0.
|
||||||
|
|
||||||
|
UInt16 numAnnotations;
|
||||||
|
|
||||||
|
Like the bookmark count, this is the definitive count of how many annotations
|
||||||
|
are in the annotation index and how many annotation records follow it. See the
|
||||||
|
section on annotation below.
|
||||||
|
|
||||||
|
UInt16 annotationRecord;
|
||||||
|
|
||||||
|
If there are any annotations, this is set to the record index number that
|
||||||
|
contains the annotation index, otherwise it is 0.
|
||||||
|
|
||||||
|
UInt8 flags;
|
||||||
|
|
||||||
|
These flags indicate various features of the zTXT database. flags is a bitmask
|
||||||
|
and at present the only two defined bits are:
|
||||||
|
|
||||||
|
ZTXT_RANDOMACCESS (0x01)
|
||||||
|
If the zTXT was compressed according to the method in mode 1, then it
|
||||||
|
supports random access and this should be set.
|
||||||
|
ZTXT_NONUNIFORM (0x02)
|
||||||
|
Setting this bit indicates that the text records within the zTXT database
|
||||||
|
are not of uniform length. That is, when the blocks of text are
|
||||||
|
decompressed they will not have identical block sizes. If this is not set,
|
||||||
|
the compressed blocks are assumed to all have the same size when
|
||||||
|
decompressed (typically 8K) except for the last block which can be smaller.
|
||||||
|
|
||||||
|
UInt32 crc32;
|
||||||
|
|
||||||
|
A CRC32 value for checking data integrity. This value is computed over all text
|
||||||
|
data records only and does not include record 0 nor any bookmark/annotation
|
||||||
|
records. The current implementation in makeztxt/Weasel computes this value
|
||||||
|
using the crc32 function in zLib which should be the standard CRC32 definition.
|
||||||
|
|
||||||
|
UInt8 padding[0x20 - 24];
|
||||||
|
|
||||||
|
zTXT record zero is 32 bytes in length, so the unused portion is padded.
|
||||||
|
|
||||||
|
|
||||||
|
zTXT Bookmarks
|
||||||
|
--------------
|
||||||
|
|
||||||
|
zTXT bookmarks are stored in a simple array in a record at the end of a zTXT.
|
||||||
|
The format is as follows:
|
||||||
|
|
||||||
|
#define MAX_BMRK_LENGTH 20
|
||||||
|
|
||||||
|
typedef struct GPlmMarkType {
|
||||||
|
UInt32 offset;
|
||||||
|
Char title[MAX_BMRK_LENGTH];
|
||||||
|
} GPlmMark;
|
||||||
|
|
||||||
|
In the structure, offset is counted as an absolute offset into the text. The
|
||||||
|
bookmarks must be sorted in ascending order.
|
||||||
|
|
||||||
|
If there are no bookmarks, then the bookmark index does not exist. When the
|
||||||
|
user creates the first bookmark, the record containing the index will then be
|
||||||
|
created. If there are annotations, when the bookmark record is created it must
|
||||||
|
go before the annotation index. This will require incrementing annotationRecord
|
||||||
|
in record 0 to point to the new record index.
|
||||||
|
|
||||||
|
Similarly, when all bookmarks are deleted the bookmark index record is also
|
||||||
|
deleted. If there are annotations, annotationRecord in record 0 must be
|
||||||
|
decremented to point to the new index.
|
||||||
|
|
||||||
|
|
||||||
|
zTXT Annotations
|
||||||
|
----------------
|
||||||
|
|
||||||
|
zTXT annotations have a format almost identical to that of the bookmark index:
|
||||||
|
|
||||||
|
typedef struct GPlmAnnotationType {
|
||||||
|
UInt32 offset;
|
||||||
|
Char title[MAX_BMRK_LENGTH];
|
||||||
|
} GPlmAnnotation;
|
||||||
|
|
||||||
|
Like the bookmarks, offset is an absolute offset into the text. The annotation
|
||||||
|
index is organized just as the bookmarks are, as a single array in a record.
|
||||||
|
Note that this structure does NOT store the actual annotation text.
|
||||||
|
|
||||||
|
The text of each annotation is stored in its own record immediately following
|
||||||
|
the index. So, the first annotation in the index will occupy the first record
|
||||||
|
following the index, and the second annotation will be in the second record
|
||||||
|
following the index, and so on. The text of each annotation is limited to
|
||||||
|
4096 bytes.
|
||||||
|
|
303
format_docs/rb.txt
Normal file
303
format_docs/rb.txt
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
Rocket eBook File Format
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
from http://rbmake.sourceforge.net/rb_format.html
|
||||||
|
|
||||||
|
|
||||||
|
Overview
|
||||||
|
--------
|
||||||
|
|
||||||
|
This document attempts to describe the format of a .rb file -- the book
|
||||||
|
format that is downloaded into NuvoMedia's <http://www.nuvomedia.com>
|
||||||
|
hand-held wonder, the Rocket eBook
|
||||||
|
<http://www.rocket-ebook.com/enter.html>.
|
||||||
|
|
||||||
|
*Note:* All multi-byte integers are stored in Vax/Intel order (the
|
||||||
|
opposite of network byte order). Most integers are 4 bytes (an int32),
|
||||||
|
but there are some minor exceptions (as detailed below).
|
||||||
|
|
||||||
|
Also, the following document refers to the .rb file sections as "pages".
|
||||||
|
|
||||||
|
|
||||||
|
Details
|
||||||
|
-------
|
||||||
|
|
||||||
|
The first 4 bytes of the file seem to be a magic number (in hex): B0 0C
|
||||||
|
B0 0C. I like to think of this as a hexadecimal pun on the word "book"
|
||||||
|
(repeated). [Matt Greenwood has reported seeing a magic number of "B0 0C
|
||||||
|
F0 0D" in another type of ReB-related file -- i.e. "book food".]
|
||||||
|
|
||||||
|
The next two bytes appear to be a version number, currently "02 00". I
|
||||||
|
assume this means major version 2, minor version 0.
|
||||||
|
|
||||||
|
The next 4 bytes are the string "NUVO", followed by 4 bytes of 00h. (I
|
||||||
|
have also seen an old title that had 0s in place of the "NUVO".)
|
||||||
|
|
||||||
|
This brings us up to offset 0Eh, at which point we have a 4-byte
|
||||||
|
representation of the date the book was created (Matt Greenwood pointed
|
||||||
|
this out to me -- thanks!). The year is encoded as an int16. An older
|
||||||
|
version of the RocketLibrary was encoding the year's full value (e.g.
|
||||||
|
1999 was "CF 07" and 2000 was "D0 07"), but a more recent version is now
|
||||||
|
using the tm_year value verbatim -- i.e. it's storing 100 for the year
|
||||||
|
2000 ("64 00"). The year is followed by an int8 for the 1-relative month
|
||||||
|
number, and an int8 for the day of the month.
|
||||||
|
|
||||||
|
After that is 6 bytes of 00h. These may be reserved for setting the time
|
||||||
|
of creation (at a guess).
|
||||||
|
|
||||||
|
Then, at offset 18h, we have an int32 that contains the absolute offset
|
||||||
|
of the "Table of Contents" (the directory of the pages contained within
|
||||||
|
this .rb file). In all of the .rb files I've seen, this remains
|
||||||
|
constant with a value of 128h. However, I have tested an atypical .rb
|
||||||
|
file where I placed the ToC at the end of the file (after all the file
|
||||||
|
contents), and it worked fine. (I've chosen not to build any books in
|
||||||
|
such a non-standard format, however.)
|
||||||
|
|
||||||
|
Immediately following this is an int32 with the length of the .rb file
|
||||||
|
(so we can check if the file is complete or not).
|
||||||
|
|
||||||
|
All the bytes from here (offset 20h) up to offset 128h appear to only be
|
||||||
|
used by an encrypted title. In a non-encrypted title, they are always 0.
|
||||||
|
|
||||||
|
The table of contents typically comes next (at offset 128h). It starts
|
||||||
|
with an int32 count of the number of "page" entries (.rb-file sections)
|
||||||
|
in the ToC. Each entry consists of a name (zero-padded to 32 bytes),
|
||||||
|
followed by 3 int32s: the length of this entry's data segment, the
|
||||||
|
absolute offset of the data in the .rb file, and a flag. The known flag
|
||||||
|
values are: 1 (encrypted), 2 (info page), and 8 (deflated). The names
|
||||||
|
are tweaked as needed to ensure that they are all unique. The current
|
||||||
|
RocketWriter software uses a unique 6-digit number, a dash, up to 8
|
||||||
|
characters from the filename, and then the re-mapped suffix for the data
|
||||||
|
(.html, .hidx, .png, .info, etc.). My rbmake library simply ensures that
|
||||||
|
the names are no longer than 15 characters (not counting the suffix) and
|
||||||
|
are all unique.
|
||||||
|
|
||||||
|
Often the first item in the ToC is the info page, but it doesn't have to
|
||||||
|
be. This page of information contains NAME=VALUE pairs that note the
|
||||||
|
author, title, what the root-page's name is, etc. (See appendix A). This
|
||||||
|
data is never encrypted nor compressed, so this entry's flag value is
|
||||||
|
always "2".
|
||||||
|
|
||||||
|
An image page is always stored as a B&W image in PNG format. Since it
|
||||||
|
has its own compression, it is stored without any additional attempt at
|
||||||
|
deflation. I have also never seen an encrypted image, so its flag value
|
||||||
|
is always 0.
|
||||||
|
|
||||||
|
An HTML page contains the tags and text that were re-written into a
|
||||||
|
consistent syntax (this presumably makes the HTML renderer in the ReB
|
||||||
|
itself simpler). HTML pages are typically compressed (See appendix B).
|
||||||
|
Every HTML page appears to use the suffix .html no matter what the file
|
||||||
|
name was on import (but I have seen older files with .htm used as the
|
||||||
|
suffix, so the rocket appears to support both).
|
||||||
|
|
||||||
|
For every HTML page there is a corresponding .hidx page that contains a
|
||||||
|
summary of the paragraph formatting and the position of the anchor names
|
||||||
|
in the associated .html page (See appendix C). This page is sometimes
|
||||||
|
compressed, depending on length (See appendix B).
|
||||||
|
|
||||||
|
There are also reference titles that have a .hkey page that contains a
|
||||||
|
list of words that can be looked up in the associated .html page (See
|
||||||
|
appendix D).
|
||||||
|
|
||||||
|
Immediately following the ToC is the data for each piece mentioned in
|
||||||
|
the ToC, in the same order as it appeared in the ToC.
|
||||||
|
|
||||||
|
Finally, the end of the file appears to be padded with 20 bytes of 01h.
|
||||||
|
|
||||||
|
|
||||||
|
Appendix A: Info Page Format
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
The info page consists of a series of lines that contain "NAME=VALUE"
|
||||||
|
strings. Each line is terminated by a single newline. Here are the
|
||||||
|
values that the RocketWriter generates:
|
||||||
|
|
||||||
|
COMMENT=Info file for <title>
|
||||||
|
TYPE=2
|
||||||
|
TITLE=<title>
|
||||||
|
AUTHOR=<author>
|
||||||
|
URL=ebook:<long, unique string used for the file's name by the librarian>
|
||||||
|
GENERATOR=<e.g. RocketLibrarian 1.3.216>
|
||||||
|
PARSE=1
|
||||||
|
OUTPUT=1
|
||||||
|
BODY=<name of root HTML page (as it appears in the ToC)>
|
||||||
|
MENUMARK=menumark.html
|
||||||
|
SuggestedRetailPrice=<usually empty>
|
||||||
|
|
||||||
|
Encrypted titles have a few more entries (including those listed above):
|
||||||
|
|
||||||
|
ISBN=<ISBN number, including dashes>
|
||||||
|
REVISION=<digits>
|
||||||
|
TITLE_LANGUAGE=<en-us>
|
||||||
|
PUB_NAME=<Publisher's name>
|
||||||
|
PUBSERVER_ID=<digits>
|
||||||
|
GENERATOR=<e.g. RocketPress 1.3.121>
|
||||||
|
VERSION=<digits>
|
||||||
|
USERNAME=<rocket-ID>
|
||||||
|
COPY_ID=<digits>
|
||||||
|
COPYRIGHT=<copyright>
|
||||||
|
COPYTITLE=<another copyright?>
|
||||||
|
|
||||||
|
A reference title also has an indication that there is a .hkey page
|
||||||
|
present, and may also have a GENRE of "Reference":
|
||||||
|
|
||||||
|
HKEY=1
|
||||||
|
GENRE=Reference
|
||||||
|
|
||||||
|
|
||||||
|
Appendix B: The format of compressed data
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
|
Compressed pages have a data section in the .rb file with the following
|
||||||
|
format:
|
||||||
|
|
||||||
|
The first int32 is a count of the number of 4096-byte chunks of data we
|
||||||
|
broke the uncompressed page into (the last chunk can be shorter than
|
||||||
|
4096 bytes, of course).
|
||||||
|
|
||||||
|
This is immediately followed by an int32 with the length of the entire
|
||||||
|
uncompressed data.
|
||||||
|
|
||||||
|
After this there are <count> int32s that indicate the size of each
|
||||||
|
chunk's compressed data.
|
||||||
|
|
||||||
|
Following these length int32s is the output from a deflation (the
|
||||||
|
algorithm used in gzip) for each 4096-byte chunk of the original data.
|
||||||
|
It appears that you must use a window-bit size of 13 and a compression
|
||||||
|
level of "best" to be compatible with the Rocket eBook's system software.
|
||||||
|
|
||||||
|
|
||||||
|
Appendix C: HTML-index Page Format
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
|
The .hidx page's purpose is to allow the renderer to quickly look up the
|
||||||
|
format of each paragraph (useful for random access to the data), and the
|
||||||
|
position of the anchor names.
|
||||||
|
|
||||||
|
The first section lists the various paragraph-producing tags. It is
|
||||||
|
headed by a line of "[tags <count>]", where <count> is the number of
|
||||||
|
tags that follow this header. The tags are listed one per line, and have
|
||||||
|
an implied enumeration from 0 to N-1 (which the other tags and the
|
||||||
|
upcoming paragraph sections reference).
|
||||||
|
|
||||||
|
The first tag is typically (always?) "<HTML> -1". The number trailing
|
||||||
|
the tag indicates what other tag (or sequence of tags, one per line) in
|
||||||
|
which we are nested. So, if we have a <BR> nested inside a <P
|
||||||
|
ALIGN="center">, it would be listed separately from a <BR> that was
|
||||||
|
nested inside a normal paragraph, and each one would have a different
|
||||||
|
trailing index number.
|
||||||
|
|
||||||
|
Following the tag section is the paragraph section. The heading is
|
||||||
|
"[paragraphs <count>]", and is followed by a line for each paragraph.
|
||||||
|
These lines consist of a character offset into the .html page for the
|
||||||
|
start of the paragraph followed by a 0-relative offset into the tag
|
||||||
|
section (indicating what kind of formatting to use for the indicated
|
||||||
|
paragraph).
|
||||||
|
|
||||||
|
The paragraph-section character offsets point to the first bit of text
|
||||||
|
after the associated tag.
|
||||||
|
|
||||||
|
The last section details the anchor names. The heading is
|
||||||
|
"[names <count>]", and each item that follows is a quoted string of the
|
||||||
|
anchor name, followed by a character offset into the .html page where
|
||||||
|
we'll find that name. If there are no names in the associated HTML
|
||||||
|
section, the heading is included with a 0 count (i.e. "[names 0]").
|
||||||
|
|
||||||
|
The name-section character offsets point to the start of the anchor tag
|
||||||
|
(not after the tag, like the offsets in the "paragraphs" section).
|
||||||
|
|
||||||
|
The lines are terminated by newlines (in standard unix fashion).
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
[tags 10]
|
||||||
|
<HTML> -1
|
||||||
|
<BODY> 0
|
||||||
|
<P ALIGN="right"> 1
|
||||||
|
<P ALIGN="left"> 1
|
||||||
|
<P> 1
|
||||||
|
<H3 ALIGN="center"> 1
|
||||||
|
<P ALIGN="center"> 1
|
||||||
|
<BR> 6
|
||||||
|
<H2 ALIGN="center"> 1
|
||||||
|
<BR> 1
|
||||||
|
|
||||||
|
[paragraphs 42]
|
||||||
|
160 9
|
||||||
|
164 9
|
||||||
|
184 8
|
||||||
|
220 8
|
||||||
|
261 6
|
||||||
|
316 5
|
||||||
|
359 1
|
||||||
|
379 6
|
||||||
|
410 6
|
||||||
|
460 7
|
||||||
|
511 7
|
||||||
|
564 7
|
||||||
|
616 7
|
||||||
|
668 7
|
||||||
|
720 7
|
||||||
|
773 7
|
||||||
|
827 7
|
||||||
|
880 7
|
||||||
|
933 7
|
||||||
|
988 7
|
||||||
|
1043 7
|
||||||
|
1100 7
|
||||||
|
1157 7
|
||||||
|
1214 7
|
||||||
|
1270 7
|
||||||
|
1328 7
|
||||||
|
1385 7
|
||||||
|
1442 7
|
||||||
|
1497 7
|
||||||
|
1556 7
|
||||||
|
1561 7
|
||||||
|
1635 1
|
||||||
|
1656 5
|
||||||
|
1690 6
|
||||||
|
1737 7
|
||||||
|
1773 5
|
||||||
|
1798 4
|
||||||
|
1826 3
|
||||||
|
2663 1
|
||||||
|
2668 4
|
||||||
|
2689 2
|
||||||
|
2730 8
|
||||||
|
|
||||||
|
[names 1]
|
||||||
|
"ch1" 2689
|
||||||
|
|
||||||
|
|
||||||
|
Appendix D: HTML-key Page Format
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
The .hkey page contains a list of words, one per line, sorted in a
|
||||||
|
strict ASCII sequence, each one followed by a tab and the offset in the
|
||||||
|
.html page of the word's data. I presume that the .hkey page must share
|
||||||
|
the same name prefix as its related .html page.
|
||||||
|
|
||||||
|
If the names contain high-bit characters, they are translated into
|
||||||
|
regular ASCII in the .hkey file, since this allows the user to search
|
||||||
|
for the words using unaccented characters.
|
||||||
|
|
||||||
|
The lines are terminated with a newline (in standard unix fashion).
|
||||||
|
|
||||||
|
An example:
|
||||||
|
|
||||||
|
a 5
|
||||||
|
apple 38
|
||||||
|
b 84
|
||||||
|
book 104
|
||||||
|
|
||||||
|
Each of these offsets points to a paragraph tag in the associated .html
|
||||||
|
page. I have only seen this sequence of tags used so far:
|
||||||
|
|
||||||
|
<P><BIG><B>word</B></BIG> other stuff</P>
|
||||||
|
|
||||||
|
I have seen multiple <B>...</B> tags in the middle of the single set of
|
||||||
|
<BIG>...</BIG> tags, but this is the basic tag format.
|
||||||
|
|
||||||
|
The offset in the .hkey page points to the start of the <P> tag.
|
||||||
|
|
56
format_docs/tcr.txt
Normal file
56
format_docs/tcr.txt
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
About
|
||||||
|
-----
|
||||||
|
|
||||||
|
Text compression format that can be decompressed starting at any point.
|
||||||
|
Little-endian byte ordering is used.
|
||||||
|
|
||||||
|
|
||||||
|
Header
|
||||||
|
------
|
||||||
|
|
||||||
|
TCR files always start with:
|
||||||
|
|
||||||
|
!!8-Bit!!
|
||||||
|
|
||||||
|
|
||||||
|
Layout
|
||||||
|
------
|
||||||
|
|
||||||
|
Header
|
||||||
|
256 key dictionary
|
||||||
|
compressed text
|
||||||
|
|
||||||
|
|
||||||
|
Dictionary
|
||||||
|
----------
|
||||||
|
|
||||||
|
A dictionary of keys and replacement strings. There are a total of 256 keys,
|
||||||
|
0 - 255. Each string is preceded by one byte that represents the length of
|
||||||
|
the string.
|
||||||
|
|
||||||
|
|
||||||
|
Compressed text
|
||||||
|
---------------
|
||||||
|
|
||||||
|
The compressed text is a series of values 0-255 which correspond to a key and
|
||||||
|
thus a string. Decompression consists of replacing each key in the compressed text with
|
||||||
|
its corresponding string.
|
||||||
|
|
||||||
|
|
||||||
|
Compressor
|
||||||
|
----------
|
||||||
|
|
||||||
|
From Andrew Giddings' TCR.c (http://www.cix.co.uk/~gidds/Software/TCR.html):
|
||||||
|
|
||||||
|
The TCR compression format is easy to describe: after the fixed header is a
|
||||||
|
dictionary of 256 strings, each preceded by a length byte. The rest of the
|
||||||
|
file is a list of codes from this dictionary.
|
||||||
|
|
||||||
|
The compressor works by starting with each code defined as itself. While
|
||||||
|
there's an unused code, it finds the most common two-code combination, and
|
||||||
|
creates a new code for it, replacing all occurrences in the text with the
|
||||||
|
new code.
|
||||||
|
|
||||||
|
It also searches for codes that are always followed by another, which it can
|
||||||
|
merge, possibly freeing up some.
|
||||||
|
|
BIN
resources/images/news/latimes.png
Normal file
BIN
resources/images/news/latimes.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 358 B |
@ -585,7 +585,6 @@ application/vnd.osa.netdeploy
|
|||||||
application/vnd.osgi.bundle
|
application/vnd.osgi.bundle
|
||||||
application/vnd.osgi.dp dp
|
application/vnd.osgi.dp dp
|
||||||
application/vnd.otps.ct-kip+xml
|
application/vnd.otps.ct-kip+xml
|
||||||
application/vnd.palm oprc pdb pqa
|
|
||||||
application/vnd.paos.xml
|
application/vnd.paos.xml
|
||||||
application/vnd.pg.format str
|
application/vnd.pg.format str
|
||||||
application/vnd.pg.osasli ei6
|
application/vnd.pg.osasli ei6
|
||||||
@ -1082,7 +1081,6 @@ chemical/x-ncbi-asn1 asn
|
|||||||
chemical/x-ncbi-asn1-ascii ent prt
|
chemical/x-ncbi-asn1-ascii ent prt
|
||||||
chemical/x-ncbi-asn1-binary aso val
|
chemical/x-ncbi-asn1-binary aso val
|
||||||
chemical/x-ncbi-asn1-spec asn
|
chemical/x-ncbi-asn1-spec asn
|
||||||
chemical/x-pdb ent pdb
|
|
||||||
chemical/x-rosdal ros
|
chemical/x-rosdal ros
|
||||||
chemical/x-swissprot sw
|
chemical/x-swissprot sw
|
||||||
chemical/x-vamas-iso14976 vms
|
chemical/x-vamas-iso14976 vms
|
||||||
@ -1379,3 +1377,5 @@ application/x-cbr cbr
|
|||||||
application/x-cb7 cb7
|
application/x-cb7 cb7
|
||||||
application/x-koboreader-ebook kobo
|
application/x-koboreader-ebook kobo
|
||||||
image/wmf wmf
|
image/wmf wmf
|
||||||
|
application/ereader pdb
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ class Noticias(BasicNewsRecipe):
|
|||||||
title = '180.com.uy'
|
title = '180.com.uy'
|
||||||
__author__ = 'Gustavo Azambuja'
|
__author__ = 'Gustavo Azambuja'
|
||||||
description = 'Noticias de Uruguay'
|
description = 'Noticias de Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -1,25 +1,25 @@
|
|||||||
# -*- coding: utf-8
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__author__ = 'Luis Hernandez'
|
__author__ = 'Luis Hernandez'
|
||||||
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
||||||
description = 'Periódico gratuito en español - v0.8 - 27 Jan 2011'
|
__version__ = 'v0.85'
|
||||||
|
__date__ = '31 January 2011'
|
||||||
|
|
||||||
'''
|
'''
|
||||||
www.20minutos.es
|
www.20minutos.es
|
||||||
'''
|
'''
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
||||||
|
|
||||||
title = u'20 Minutos'
|
title = u'20 Minutos new'
|
||||||
publisher = u'Grupo 20 Minutos'
|
publisher = u'Grupo 20 Minutos'
|
||||||
|
|
||||||
__author__ = 'Luis Hernández'
|
__author__ = 'Luis Hernandez'
|
||||||
description = 'Periódico gratuito en español'
|
description = 'Free spanish newspaper'
|
||||||
cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
|
cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
|
||||||
|
|
||||||
oldest_article = 5
|
oldest_article = 2
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
@ -29,6 +29,7 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
|||||||
encoding = 'ISO-8859-1'
|
encoding = 'ISO-8859-1'
|
||||||
language = 'es'
|
language = 'es'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
|
remove_empty_feeds = True
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='div', attrs={'id':['content','vinetas',]})
|
dict(name='div', attrs={'id':['content','vinetas',]})
|
||||||
@ -43,13 +44,21 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
|||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='ol', attrs={'class':['navigation',]})
|
dict(name='ol', attrs={'class':['navigation',]})
|
||||||
,dict(name='span', attrs={'class':['action']})
|
,dict(name='span', attrs={'class':['action']})
|
||||||
,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
|
,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
|
||||||
,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']})
|
,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']})
|
||||||
,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']})
|
,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']})
|
||||||
,dict(name='ul', attrs={'id':['site-links']})
|
,dict(name='ul', attrs={'id':['site-links']})
|
||||||
,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']})
|
,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']})
|
||||||
]
|
]
|
||||||
|
|
||||||
|
extra_css = """
|
||||||
|
p{text-align: justify; font-size: 100%}
|
||||||
|
body{ text-align: left; font-size:100% }
|
||||||
|
h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
|
||||||
|
"""
|
||||||
|
|
||||||
|
preprocess_regexps = [(re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL), lambda m: '')]
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
(u'Portada' , u'http://www.20minutos.es/rss/')
|
(u'Portada' , u'http://www.20minutos.es/rss/')
|
||||||
,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/')
|
,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/')
|
||||||
@ -65,6 +74,6 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
|||||||
,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/')
|
,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/')
|
||||||
,(u'Cine' , u'http://www.20minutos.es/rss/cine/')
|
,(u'Cine' , u'http://www.20minutos.es/rss/cine/')
|
||||||
,(u'Musica' , u'http://www.20minutos.es/rss/musica/')
|
,(u'Musica' , u'http://www.20minutos.es/rss/musica/')
|
||||||
,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/')
|
,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/')
|
||||||
,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/')
|
,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/')
|
||||||
]
|
]
|
||||||
|
@ -20,7 +20,7 @@ class SieteDias(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -58,4 +58,4 @@ class Ambito(BasicNewsRecipe):
|
|||||||
del item['style']
|
del item['style']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
@ -12,7 +12,7 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
|
|||||||
masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png'
|
masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png'
|
||||||
oldest_article = 1
|
oldest_article = 1
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
language = 'es'
|
language = 'es_MX'
|
||||||
|
|
||||||
#feeds = [(u'Animal Politico', u'http://www.animalpolitico.com/feed/')]
|
#feeds = [(u'Animal Politico', u'http://www.animalpolitico.com/feed/')]
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ class Axxon_news(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = False
|
no_stylesheets = False
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
publication_type = 'magazine'
|
publication_type = 'magazine'
|
||||||
INDEX = 'http://axxon.com.ar/rev/'
|
INDEX = 'http://axxon.com.ar/rev/'
|
||||||
|
@ -18,7 +18,7 @@ class Axxon_news(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = False
|
no_stylesheets = False
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
|
|
||||||
|
53
resources/recipes/bbc_es.recipe
Normal file
53
resources/recipes/bbc_es.recipe
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
__author__ = 'Luis Hernandez'
|
||||||
|
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
||||||
|
__version__ = 'v1.0'
|
||||||
|
__date__ = '29 January 2011'
|
||||||
|
|
||||||
|
'''
|
||||||
|
http://www.bbc.co.uk/mundo/
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
||||||
|
|
||||||
|
title = u'BBC Mundo'
|
||||||
|
publisher = u'BBC'
|
||||||
|
|
||||||
|
__author__ = 'Luis Hernandez'
|
||||||
|
description = 'BBC World for spanish readers'
|
||||||
|
|
||||||
|
cover_url = 'http://1.bp.blogspot.com/_NHiOjk_uZwU/TEYy7IJAdAI/AAAAAAAABP8/coAE-pJ7_5E/s1600/bbcmundo_h.png'
|
||||||
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
|
remove_javascript = True
|
||||||
|
no_stylesheets = True
|
||||||
|
use_embedded_content = False
|
||||||
|
|
||||||
|
language = 'es'
|
||||||
|
remove_empty_feeds = True
|
||||||
|
encoding = 'UTF-8'
|
||||||
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
|
|
||||||
|
remove_tags_before = dict(name='div' , attrs={'class':['g-group']})
|
||||||
|
remove_tags_after = dict(name='div' , attrs={'class':[' g-w8']})
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='ul', attrs={'class':['document-tools blq-clearfix','blq-clearfix']})
|
||||||
|
,dict(name='div', attrs={'class':['box bx-quote-bubble','socialmedia-links','list li-carousel','list li-plain rolling-news','list li-plain','box bx-livestats','li-tab content','list li-relatedlinks','list li-relatedinternetlinks']})
|
||||||
|
]
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Portada' , u'http://www.bbc.co.uk/mundo/index.xml')
|
||||||
|
,(u'Ultimas Noticias' , u'http://www.bbc.co.uk/mundo/ultimas_noticias/index.xml')
|
||||||
|
,(u'Internacional' , u'http://www.bbc.co.uk/mundo/temas/internacional/index.xml')
|
||||||
|
,(u'Economia' , u'http://www.bbc.co.uk/mundo/temas/economia/index.xml')
|
||||||
|
,(u'America Latina' , u'http://www.bbc.co.uk/mundo/temas/america_latina/index.xml')
|
||||||
|
,(u'Ciencia' , u'http://www.bbc.co.uk/mundo/temas/ciencia/index.xml')
|
||||||
|
,(u'Salud' , u'http://www.bbc.co.uk/mundo/temas/salud/index.xml')
|
||||||
|
,(u'Tecnologia' , u'http://www.bbc.co.uk/mundo/temas/tecnologia/index.xml')
|
||||||
|
,(u'Cultura' , u'http://www.bbc.co.uk/mundo/temas/cultura/index.xml')
|
||||||
|
]
|
||||||
|
|
@ -12,7 +12,7 @@ class General(BasicNewsRecipe):
|
|||||||
title = 'bitacora.com.uy'
|
title = 'bitacora.com.uy'
|
||||||
__author__ = 'Gustavo Azambuja'
|
__author__ = 'Gustavo Azambuja'
|
||||||
description = 'Noticias de Uruguay'
|
description = 'Noticias de Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -20,7 +20,7 @@ class BsAsEconomico(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
71
resources/recipes/cinco_dias.recipe
Normal file
71
resources/recipes/cinco_dias.recipe
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
__license__ = 'GPL v3'
|
||||||
|
__author__ = 'Luis Hernandez'
|
||||||
|
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
||||||
|
__version__ = 'v1.2'
|
||||||
|
__date__ = '31 January 2011'
|
||||||
|
|
||||||
|
'''
|
||||||
|
http://www.cincodias.com/
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
||||||
|
|
||||||
|
title = u'Cinco Dias'
|
||||||
|
publisher = u'Grupo Prisa'
|
||||||
|
|
||||||
|
__author__ = 'Luis Hernandez'
|
||||||
|
description = 'spanish web about money and bussiness, free edition'
|
||||||
|
|
||||||
|
cover_url = 'http://www.prisa.com/images/logos/logo_cinco_dias.gif'
|
||||||
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
|
remove_javascript = True
|
||||||
|
no_stylesheets = True
|
||||||
|
use_embedded_content = False
|
||||||
|
|
||||||
|
language = 'es'
|
||||||
|
remove_empty_feeds = True
|
||||||
|
encoding = 'ISO-8859-1'
|
||||||
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(name='div', attrs={'class':['cab_articulo cab_noticia','pos_3','txt_noticia','mod_despiece']})
|
||||||
|
,dict(name='p', attrs={'class':['cintillo']})
|
||||||
|
]
|
||||||
|
|
||||||
|
remove_tags_before = dict(name='div' , attrs={'class':['publi_h']})
|
||||||
|
remove_tags_after = dict(name='div' , attrs={'class':['tab_util util_estadisticas']})
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='div', attrs={'class':['util-1','util-2','util-3','inner estirar','inner1','inner2','inner3','cont','tab_util util_estadisticas','tab_util util_enviar','mod_list_inf','mod_similares','mod_divisas','mod_sectores','mod_termometro','mod post','mod_img','mod_txt','nivel estirar','barra estirar','info_brujula btnBrujula','utilidad_brujula estirar']})
|
||||||
|
,dict(name='li', attrs={'class':['lnk-fcbook','lnk-retweet','lnk-meneame','desplegable','comentarios','list-options','estirar']})
|
||||||
|
,dict(name='ul', attrs={'class':['lista-izquierda','list-options','estirar']})
|
||||||
|
,dict(name='p', attrs={'class':['autor']})
|
||||||
|
]
|
||||||
|
|
||||||
|
extra_css = """
|
||||||
|
p{text-align: justify; font-size: 100%}
|
||||||
|
body{ text-align: left; font-size:100% }
|
||||||
|
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
|
||||||
|
h3{font-family: sans-serif; font-size:100%; font-style: italic; text-align: justify; }
|
||||||
|
"""
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Ultima Hora' , u'http://www.cincodias.com/rss/feed.html?feedId=17029')
|
||||||
|
,(u'Empresas' , u'http://www.cincodias.com/rss/feed.html?feedId=19')
|
||||||
|
,(u'Mercados' , u'http://www.cincodias.com/rss/feed.html?feedId=20')
|
||||||
|
,(u'Economia' , u'http://www.cincodias.com/rss/feed.html?feedId=21')
|
||||||
|
,(u'Tecnorama' , u'http://www.cincodias.com/rss/feed.html?feedId=17230')
|
||||||
|
,(u'Tecnologia' , u'http://www.cincodias.com/rss/feed.html?feedId=17106')
|
||||||
|
,(u'Finanzas Personales' , u'http://www.cincodias.com/rss/feed.html?feedId=22')
|
||||||
|
,(u'Fiscalidad' , u'http://www.cincodias.com/rss/feed.html?feedId=17107')
|
||||||
|
,(u'Vivienda' , u'http://www.cincodias.com/rss/feed.html?feedId=17108')
|
||||||
|
,(u'Tendencias' , u'http://www.cincodias.com/rss/feed.html?feedId=17109')
|
||||||
|
,(u'Empleo' , u'http://www.cincodias.com/rss/feed.html?feedId=17110')
|
||||||
|
,(u'IBEX 35' , u'http://www.cincodias.com/rss/feed.html?feedId=17125')
|
||||||
|
,(u'Sectores' , u'http://www.cincodias.com/rss/feed.html?feedId=17126')
|
||||||
|
,(u'Opinion' , u'http://www.cincodias.com/rss/feed.html?feedId=17105')
|
||||||
|
]
|
@ -18,7 +18,7 @@ class Clarin(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
INDEX = 'http://www.clarin.com'
|
INDEX = 'http://www.clarin.com'
|
||||||
masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg'
|
masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg'
|
||||||
|
@ -14,7 +14,7 @@ class CriticaDigital(BasicNewsRecipe):
|
|||||||
description = 'Noticias de Argentina'
|
description = 'Noticias de Argentina'
|
||||||
oldest_article = 2
|
oldest_article = 2
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
@ -11,7 +11,7 @@ class CubaDebate(BasicNewsRecipe):
|
|||||||
__author__ = 'Darko Miletic'
|
__author__ = 'Darko Miletic'
|
||||||
description = 'Contra el Terorismo Mediatico'
|
description = 'Contra el Terorismo Mediatico'
|
||||||
oldest_article = 15
|
oldest_article = 15
|
||||||
language = 'es'
|
language = 'es_CU'
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
@ -16,7 +16,7 @@ class DeutscheWelle_es(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'es'
|
language = 'de_ES'
|
||||||
publication_type = 'newsportal'
|
publication_type = 'newsportal'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
|
masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
|
||||||
|
@ -20,7 +20,7 @@ class Diagonales(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -20,7 +20,7 @@ class ElMercurio(BasicNewsRecipe):
|
|||||||
masthead_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
|
masthead_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_CL'
|
||||||
|
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
|
@ -13,7 +13,7 @@ class ObservaDigital(BasicNewsRecipe):
|
|||||||
title = 'Observa Digital'
|
title = 'Observa Digital'
|
||||||
__author__ = 'yrvn'
|
__author__ = 'yrvn'
|
||||||
description = 'Noticias de Uruguay'
|
description = 'Noticias de Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -14,7 +14,7 @@ class General(BasicNewsRecipe):
|
|||||||
description = 'Noticias de Uruguay y el resto del mundo'
|
description = 'Noticias de Uruguay y el resto del mundo'
|
||||||
publisher = 'EL PAIS S.A.'
|
publisher = 'EL PAIS S.A.'
|
||||||
category = 'news, politics, Uruguay'
|
category = 'news, politics, Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 2
|
recursion = 2
|
||||||
|
@ -20,7 +20,7 @@ class ElUniversal(BasicNewsRecipe):
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
language = 'es'
|
language = 'es_MX'
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
body{font-family:Arial,Helvetica,sans-serif}
|
body{font-family:Arial,Helvetica,sans-serif}
|
||||||
|
@ -20,7 +20,7 @@ class ElArgentino(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
|
cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
|
|
||||||
html2lrf_options = [
|
html2lrf_options = [
|
||||||
|
@ -18,7 +18,7 @@ class ElComercio(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
use_embedded_content = True
|
use_embedded_content = True
|
||||||
language = 'es'
|
language = 'es_EC'
|
||||||
masthead_url = 'http://ww1.elcomercio.com/nv_images/headers/EC/logo_new_08.gif'
|
masthead_url = 'http://ww1.elcomercio.com/nv_images/headers/EC/logo_new_08.gif'
|
||||||
extra_css = ' body{font-family: Arial,Verdana,sans-serif} img{margin-bottom: 1em} '
|
extra_css = ' body{font-family: Arial,Verdana,sans-serif} img{margin-bottom: 1em} '
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ class ElCronista(BasicNewsRecipe):
|
|||||||
__author__ = 'Darko Miletic'
|
__author__ = 'Darko Miletic'
|
||||||
description = 'Noticias de Argentina'
|
description = 'Noticias de Argentina'
|
||||||
oldest_article = 2
|
oldest_article = 2
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
@ -21,7 +21,7 @@ class ElTiempoHn(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_HN'
|
||||||
|
|
||||||
lang = 'es-HN'
|
lang = 'es-HN'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -18,7 +18,7 @@ class ElUniversal(BasicNewsRecipe):
|
|||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
publisher = 'El Universal'
|
publisher = 'El Universal'
|
||||||
category = 'news, Caracas, Venezuela, world'
|
category = 'news, Caracas, Venezuela, world'
|
||||||
language = 'es'
|
language = 'es_VE'
|
||||||
cover_url = strftime('http://static.eluniversal.com/%Y/%m/%d/portada.jpg')
|
cover_url = strftime('http://static.eluniversal.com/%Y/%m/%d/portada.jpg')
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
|
@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
class ElUniversalImpresaRecipe(BasicNewsRecipe):
|
class ElUniversalImpresaRecipe(BasicNewsRecipe):
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__author__ = 'kwetal'
|
__author__ = 'kwetal'
|
||||||
language = 'es'
|
language = 'es_MX'
|
||||||
version = 1
|
version = 1
|
||||||
|
|
||||||
title = u'El Universal (Edici\u00F3n Impresa)'
|
title = u'El Universal (Edici\u00F3n Impresa)'
|
||||||
|
@ -17,7 +17,7 @@ class ElUniverso_Ecuador(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_EC'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
masthead_url = 'http://servicios2.eluniverso.com/versiones/v1/img/Hd/lg_ElUniverso.gif'
|
masthead_url = 'http://servicios2.eluniverso.com/versiones/v1/img/Hd/lg_ElUniverso.gif'
|
||||||
|
@ -18,3 +18,6 @@ class EndgadgetJapan(BasicNewsRecipe):
|
|||||||
language = 'ja'
|
language = 'ja'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]
|
feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]
|
||||||
|
|
||||||
|
remove_tags_before = dict(name="div", attrs={'id':"content_wrap"})
|
||||||
|
remove_tags_after = dict(name='h3', attrs={'id':'addcomments'})
|
||||||
|
54
resources/recipes/explosm.recipe
Normal file
54
resources/recipes/explosm.recipe
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
import re
|
||||||
|
|
||||||
|
class Explosm(BasicNewsRecipe):
|
||||||
|
title = u'Explosm Rotated'
|
||||||
|
__author__ = 'Andromeda Rabbit'
|
||||||
|
description = 'Explosm'
|
||||||
|
language = 'en'
|
||||||
|
use_embedded_content = False
|
||||||
|
no_stylesheets = True
|
||||||
|
oldest_article = 24
|
||||||
|
remove_javascript = True
|
||||||
|
remove_empty_feeds = True
|
||||||
|
max_articles_per_feed = 10
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
(u'Explosm Feed', u'http://feeds.feedburner.com/Explosm')
|
||||||
|
]
|
||||||
|
|
||||||
|
#match_regexps = [r'http://www.explosm.net/comics/.*']
|
||||||
|
|
||||||
|
keep_only_tags = [dict(name='img', attrs={'alt':'Cyanide and Happiness, a daily webcomic'})]
|
||||||
|
remove_tags = [dict(name='div'), dict(name='span'), dict(name='table'), dict(name='br'), dict(name='nobr'), dict(name='a'), dict(name='b')]
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
|
||||||
|
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
|
||||||
|
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
|
||||||
|
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}'''
|
||||||
|
|
||||||
|
def get_cover_url(self):
|
||||||
|
return 'http://cdn.shopify.com/s/files/1/0059/1872/products/cyanidetitle_large.jpg?1295846286'
|
||||||
|
|
||||||
|
def parse_feeds(self):
|
||||||
|
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||||
|
|
||||||
|
for curfeed in feeds:
|
||||||
|
delList = []
|
||||||
|
for a,curarticle in enumerate(curfeed.articles):
|
||||||
|
if re.search(r'http://www.explosm.net/comics', curarticle.url) == None:
|
||||||
|
delList.append(curarticle)
|
||||||
|
if len(delList)>0:
|
||||||
|
for d in delList:
|
||||||
|
index = curfeed.articles.index(d)
|
||||||
|
curfeed.articles[index:index+1] = []
|
||||||
|
|
||||||
|
return feeds
|
||||||
|
|
||||||
|
def skip_ad_pages(self, soup):
|
||||||
|
# Skip ad pages served before actual article
|
||||||
|
skip_tag = soup.find(name='img', attrs={'alt':'Cyanide and Happiness, a daily webcomic'})
|
||||||
|
if skip_tag is None:
|
||||||
|
return soup
|
||||||
|
return None
|
@ -12,7 +12,7 @@ class General(BasicNewsRecipe):
|
|||||||
title = 'freeway.com.uy'
|
title = 'freeway.com.uy'
|
||||||
__author__ = 'Gustavo Azambuja'
|
__author__ = 'Gustavo Azambuja'
|
||||||
description = 'Revista Freeway, Montevideo, Uruguay'
|
description = 'Revista Freeway, Montevideo, Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 1
|
recursion = 1
|
||||||
|
@ -20,7 +20,7 @@ class Granma(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg'
|
cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg'
|
||||||
language = 'es'
|
language = 'es_CU'
|
||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ class iEco(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
publisher = 'Grupo Clarin'
|
publisher = 'Grupo Clarin'
|
||||||
category = 'news, economia, mercados, bolsa de valores, finanzas, empresas, negocios, empleos, emprendedores, marketinguniversidades, tecnologia, agronegocios, noticias, informacion'
|
category = 'news, economia, mercados, bolsa de valores, finanzas, empresas, negocios, empleos, emprendedores, marketinguniversidades, tecnologia, agronegocios, noticias, informacion'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
cover_url = 'http://www.ieco.clarin.com/static2/images/Tapa-PDF.gif'
|
cover_url = 'http://www.ieco.clarin.com/static2/images/Tapa-PDF.gif'
|
||||||
extra_css = ' #bd{font-family: sans-serif} '
|
extra_css = ' #bd{font-family: sans-serif} '
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ class Infobae(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
|
masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
@ -20,7 +20,7 @@ class Juventudrebelde(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
language = 'es'
|
language = 'es_CU'
|
||||||
|
|
||||||
cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg')
|
cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg')
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
|
@ -50,4 +50,4 @@ class LaCuarta(BasicNewsRecipe):
|
|||||||
feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')]
|
feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')]
|
||||||
|
|
||||||
|
|
||||||
language = 'es'
|
language = 'es_CL'
|
||||||
|
@ -12,7 +12,7 @@ class General(BasicNewsRecipe):
|
|||||||
title = 'La Diaria'
|
title = 'La Diaria'
|
||||||
__author__ = 'Gustavo Azambuja'
|
__author__ = 'Gustavo Azambuja'
|
||||||
description = 'Noticias de Uruguay'
|
description = 'Noticias de Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -19,7 +19,7 @@ class LaJornada_mx(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'utf8'
|
encoding = 'utf8'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_MX'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
cover_url = strftime("http://www.jornada.unam.mx/%Y/%m/%d/portada.pdf")
|
cover_url = strftime("http://www.jornada.unam.mx/%Y/%m/%d/portada.pdf")
|
||||||
masthead_url = 'http://www.jornada.unam.mx/v7.0/imagenes/la-jornada-trans.png'
|
masthead_url = 'http://www.jornada.unam.mx/v7.0/imagenes/la-jornada-trans.png'
|
||||||
|
@ -18,7 +18,7 @@ class LaRazon_Bol(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_BO'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
delay = 1
|
delay = 1
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
|
@ -19,7 +19,7 @@ class LaSegunda(BasicNewsRecipe):
|
|||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
masthead_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
|
masthead_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
language = 'es'
|
language = 'es_CL'
|
||||||
extra_css = ' .titulonegritastop{font-size: xx-large; font-weight: bold} '
|
extra_css = ' .titulonegritastop{font-size: xx-large; font-weight: bold} '
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
|
@ -2,24 +2,23 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__author__ = 'Luis Hernandez'
|
__author__ = 'Luis Hernandez'
|
||||||
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
|
||||||
description = 'Diario local de Talavera de la Reina - v1.2 - 27 Jan 2011'
|
__version__ = 'v1.0'
|
||||||
|
__date__ = '01 Feb 2011'
|
||||||
|
|
||||||
'''
|
'''
|
||||||
http://www.latribunadetalavera.es/
|
http://www.promecal.es/
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
||||||
|
|
||||||
title = u'La Tribuna de Talavera'
|
title = u'La Tribuna de'
|
||||||
publisher = u'Grupo PROMECAL'
|
publisher = u'Grupo PROMECAL'
|
||||||
|
|
||||||
__author__ = 'Luis Hernández'
|
__author__ = 'Luis Hernández'
|
||||||
description = 'Diario local de Talavera de la Reina'
|
description = 'Varios diarios locales del grupo PROMECAL'
|
||||||
cover_url = 'http://www.latribunadetalavera.es/entorno/mancheta.gif'
|
|
||||||
|
|
||||||
oldest_article = 5
|
oldest_article = 3
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 50
|
||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
@ -27,7 +26,7 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_ES'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
@ -39,7 +38,20 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
|||||||
remove_tags_before = dict(name='div' , attrs={'class':['comparte']})
|
remove_tags_before = dict(name='div' , attrs={'class':['comparte']})
|
||||||
remove_tags_after = dict(name='div' , attrs={'id':['relacionadas']})
|
remove_tags_after = dict(name='div' , attrs={'id':['relacionadas']})
|
||||||
|
|
||||||
extra_css = ' p{text-align: justify; font-size: 100%} body{ text-align: left; font-family: serif; font-size: 100% } h1{ font-family: sans-serif; font-size:150%; font-weight: 700; text-align: justify; } h2{ font-family: sans-serif; font-size:120%; font-weight: 600; text-align: justify } h3{ font-family: sans-serif; font-size:60%; font-weight: 600; text-align: left } h4{ font-family: sans-serif; font-size:80%; font-weight: 600; text-align: left } h5{ font-family: sans-serif; font-size:70%; font-weight: 600; text-align: left }img{margin-bottom: 0.4em} '
|
remove_tags = [
|
||||||
|
dict(name='div', attrs={'id':['relacionadas']})
|
||||||
|
,dict(name='h3')
|
||||||
|
,dict(name='h5')
|
||||||
|
]
|
||||||
|
|
||||||
|
extra_css = """
|
||||||
|
p{text-align: justify; font-size: 100%}
|
||||||
|
body{text-align: left; font-family: serif; font-size: 100%}
|
||||||
|
h1{font-family: sans; font-size:150%; font-weight: bold; text-align: justify;}
|
||||||
|
h2{font-family: sans-serif; font-size:85%; font-style: italic; text-align: justify;}
|
||||||
|
h4{font-family: sans; font-size:75%; font-weight: bold; text-align: center;}
|
||||||
|
img{margin-bottom: 0.4em}
|
||||||
|
"""
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for alink in soup.findAll('a'):
|
for alink in soup.findAll('a'):
|
||||||
@ -48,4 +60,15 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
|
|||||||
alink.replaceWith(tstr)
|
alink.replaceWith(tstr)
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
feeds = [(u'Portada', u'http://www.latribunadetalavera.es/rss.html')]
|
|
||||||
|
feeds = [
|
||||||
|
(u'Albacete', u'http://www.latribunadealbacete.es/rss.html')
|
||||||
|
,(u'Avila', u'http://www.diariodeavila.es/rss.html')
|
||||||
|
,(u'Burgos', u'http://www.diariodeburgos.es/rss.html')
|
||||||
|
,(u'Ciudad Real', u'http://www.latribunadeciudadreal.es/rss.html')
|
||||||
|
,(u'Palencia', u'http://www.diariopalentino.es/rss.html')
|
||||||
|
,(u'Puertollano', u'http://www.latribunadepuertollano.es/rss.html')
|
||||||
|
,(u'Talavera de la Reina', u'http://www.latribunadetalavera.es/rss.html')
|
||||||
|
,(u'Toledo', u'http://www.latribunadetoledo.es/rss.html')
|
||||||
|
,(u'Valladolid', u'http://www.eldiadevalladolid.com/rss.html')
|
||||||
|
]
|
||||||
|
@ -19,7 +19,7 @@ class LaMujerDeMiVida(BasicNewsRecipe):
|
|||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
publisher = 'La Mujer de mi Vida'
|
publisher = 'La Mujer de mi Vida'
|
||||||
category = 'literatura, critica, arte, ensayos'
|
category = 'literatura, critica, arte, ensayos'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
INDEX = 'http://www.lamujerdemivida.com.ar/'
|
INDEX = 'http://www.lamujerdemivida.com.ar/'
|
||||||
html2lrf_options = [
|
html2lrf_options = [
|
||||||
|
@ -16,7 +16,7 @@ class Lanacion(BasicNewsRecipe):
|
|||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
masthead_url = 'http://www.lanacion.com.ar/imgs/layout/logos/ln341x47.gif'
|
masthead_url = 'http://www.lanacion.com.ar/imgs/layout/logos/ln341x47.gif'
|
||||||
|
@ -51,4 +51,4 @@ class LaNacionChile(BasicNewsRecipe):
|
|||||||
del item['style']
|
del item['style']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
language = 'es'
|
language = 'es_CL'
|
||||||
|
@ -21,7 +21,7 @@ class LaPrensa(BasicNewsRecipe):
|
|||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
# cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'
|
# cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
lang = 'es'
|
lang = 'es'
|
||||||
|
|
||||||
html2lrf_options = [
|
html2lrf_options = [
|
||||||
|
@ -21,7 +21,7 @@ class LaPrensaHn(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_HN'
|
||||||
|
|
||||||
lang = 'es-HN'
|
lang = 'es-HN'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -22,7 +22,7 @@ class LaPrensa_ni(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
language = 'es'
|
language = 'es_NI'
|
||||||
|
|
||||||
months_es = ['enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre']
|
months_es = ['enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre']
|
||||||
current_month = months_es[datetime.date.today().month - 1]
|
current_month = months_es[datetime.date.today().month - 1]
|
||||||
|
@ -1,73 +1,92 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
'''
|
'''
|
||||||
latimes.com
|
www.latimes.com
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class LATimes(BasicNewsRecipe):
|
class LATimes(BasicNewsRecipe):
|
||||||
title = u'The Los Angeles Times'
|
title = 'Los Angeles Times'
|
||||||
__author__ = u'Darko Miletic and Sujata Raman'
|
__author__ = 'Darko Miletic'
|
||||||
description = u'News from Los Angeles'
|
description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'
|
||||||
oldest_article = 7
|
publisher = 'Tribune Company'
|
||||||
max_articles_per_feed = 100
|
category = 'news, politics, USA, Los Angeles, world'
|
||||||
language = 'en'
|
oldest_article = 2
|
||||||
|
max_articles_per_feed = 200
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
encoding = 'utf8'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
language = 'en'
|
||||||
lang = 'en-US'
|
remove_empty_feeds = True
|
||||||
|
publication_type = 'newspaper'
|
||||||
|
masthead_url = 'http://www.latimes.com/images/logo.png'
|
||||||
|
cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'
|
||||||
|
extra_css = """
|
||||||
|
body{font-family: Georgia,"Times New Roman",Times,serif }
|
||||||
|
img{margin-bottom: 0.4em; margin-top: 0.8em; display:block}
|
||||||
|
h2{font-size: 1.1em}
|
||||||
|
.deckhead{font-size: small; text-transform: uppercase}
|
||||||
|
.small{color: gray; font-size: small}
|
||||||
|
.date,.time,.copyright{font-size: x-small; color:gray; font-style:italic;}
|
||||||
|
"""
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comment' : description
|
'comment' : description
|
||||||
, 'language' : lang
|
, 'tags' : category
|
||||||
}
|
, 'publisher' : publisher
|
||||||
|
, 'language' : language
|
||||||
|
, 'linearize_tables' : 'Yes'
|
||||||
|
}
|
||||||
|
|
||||||
extra_css = '''
|
keep_only_tags = [
|
||||||
h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; }
|
dict(name='div', attrs={'class':'story'})
|
||||||
h2{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
|
,dict(attrs={'class':['entry-header','time','entry-content']})
|
||||||
.story{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
|
]
|
||||||
.entry-body{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
|
remove_tags_after=dict(name='p', attrs={'class':'copyright'})
|
||||||
.entry-more{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
|
remove_tags = [
|
||||||
.credit{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
|
dict(name=['meta','link','iframe','object','embed'])
|
||||||
.small{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
|
,dict(attrs={'class':['toolSet','articlerail','googleAd','entry-footer-left','entry-footer-right','entry-footer-social','google-ad-story-bottom','sphereTools']})
|
||||||
.byline{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
|
,dict(attrs={'id':['article-promo','googleads','moduleArticleToolsContainer','gallery-subcontent']})
|
||||||
.date{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
|
]
|
||||||
.time{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
|
remove_attributes=['lang','xmlns:fb','xmlns:og','border','xtags','i','article_body']
|
||||||
.copyright{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; }
|
|
||||||
.subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
|
|
||||||
'''
|
|
||||||
|
|
||||||
# recursions = 1
|
|
||||||
# match_regexps = [r'http://www.latimes.com/.*page=[2-9]']
|
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class':["story" ,"entry"] })]
|
|
||||||
|
|
||||||
|
|
||||||
remove_tags = [ dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
|
feeds = [
|
||||||
dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}),
|
(u'Top News' , u'http://feeds.latimes.com/latimes/news' )
|
||||||
dict(name='p', attrs={'class':["entry-footer",]}),
|
,(u'Local News' , u'http://feeds.latimes.com/latimes/news/local' )
|
||||||
dict(name='ul', attrs={'class':"article-nav clearfix"}),
|
,(u'National' , u'http://feeds.latimes.com/latimes/news/nationworld/nation' )
|
||||||
dict(name=['iframe'])
|
,(u'National Politics' , u'http://feeds.latimes.com/latimes/news/politics/' )
|
||||||
]
|
,(u'Business' , u'http://feeds.latimes.com/latimes/business' )
|
||||||
|
,(u'Education' , u'http://feeds.latimes.com/latimes/news/education' )
|
||||||
|
,(u'Environment' , u'http://feeds.latimes.com/latimes/news/science/environment' )
|
||||||
feeds = [(u'News', u'http://feeds.latimes.com/latimes/news')
|
,(u'Religion' , u'http://feeds.latimes.com/latimes/features/religion' )
|
||||||
,(u'Local','http://feeds.latimes.com/latimes/news/local')
|
,(u'Science' , u'http://feeds.latimes.com/latimes/news/science' )
|
||||||
,(u'MostEmailed','http://feeds.latimes.com/MostEmailed')
|
,(u'Technology' , u'http://feeds.latimes.com/latimes/technology' )
|
||||||
,(u'Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/')
|
,(u'Africa' , u'http://feeds.latimes.com/latimes/africa' )
|
||||||
,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/')
|
,(u'Asia' , u'http://feeds.latimes.com/latimes/asia' )
|
||||||
,('National','http://feeds.latimes.com/latimes/news/nationworld/nation')
|
,(u'Europe' , u'http://feeds.latimes.com/latimes/europe' )
|
||||||
,('Politics','http://feeds.latimes.com/latimes/news/politics/')
|
,(u'Latin America' , u'http://feeds.latimes.com/latimes/latinamerica' )
|
||||||
,('Business','http://feeds.latimes.com/latimes/business')
|
,(u'Middle East' , u'http://feeds.latimes.com/latimes/middleeast' )
|
||||||
,('Sports','http://feeds.latimes.com/latimes/sports/')
|
,(u'Arts&Culture' , u'http://feeds.feedburner.com/latimes/entertainment/news/arts' )
|
||||||
,('Entertainment','http://feeds.latimes.com/latimes/entertainment/')
|
,(u'Entertainment News' , u'http://feeds.feedburner.com/latimes/entertainment/news/' )
|
||||||
]
|
,(u'Movie News' , u'http://feeds.feedburner.com/latimes/entertainment/news/movies/' )
|
||||||
|
,(u'Movie Reviews' , u'http://feeds.feedburner.com/movies/reviews/' )
|
||||||
|
,(u'Music News' , u'http://feeds.feedburner.com/latimes/entertainment/news/music/' )
|
||||||
|
,(u'Pop Album Reviews' , u'http://feeds.feedburner.com/latimes/pop-album-reviews' )
|
||||||
|
,(u'Restaurant Reviews' , u'http://feeds.feedburner.com/latimes/restaurant/reviews' )
|
||||||
|
,(u'Theatar and Dance' , u'http://feeds.feedburner.com/latimes/theaterdance' )
|
||||||
|
,(u'Autos' , u'http://feeds.latimes.com/latimes/classified/automotive/highway1/')
|
||||||
|
,(u'Books' , u'http://feeds.latimes.com/features/books' )
|
||||||
|
,(u'Food' , u'http://feeds.latimes.com/latimes/features/food/' )
|
||||||
|
,(u'Health' , u'http://feeds.latimes.com/latimes/features/health/' )
|
||||||
|
,(u'Real Estate' , u'http://feeds.latimes.com/latimes/classified/realestate/' )
|
||||||
|
,(u'Commentary' , u'http://feeds2.feedburner.com/latimes/news/opinion/commentary/' )
|
||||||
|
,(u'Sports' , u'http://feeds.latimes.com/latimes/sports/' )
|
||||||
|
]
|
||||||
|
|
||||||
def get_article_url(self, article):
|
def get_article_url(self, article):
|
||||||
ans = article.get('feedburner_origlink').rpartition('?')[0]
|
ans = BasicNewsRecipe.get_article_url(self, article).rpartition('?')[0]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.log('Looking for full story link in', ans)
|
self.log('Looking for full story link in', ans)
|
||||||
@ -83,4 +102,22 @@ class LATimes(BasicNewsRecipe):
|
|||||||
pass
|
pass
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for item in soup.findAll(style=True):
|
||||||
|
del item['style']
|
||||||
|
for item in soup.findAll('img'):
|
||||||
|
if not item.has_key('alt'):
|
||||||
|
item['alt'] = 'image'
|
||||||
|
for item in soup.findAll('a'):
|
||||||
|
limg = item.find('img')
|
||||||
|
if item.string is not None:
|
||||||
|
str = item.string
|
||||||
|
item.replaceWith(str)
|
||||||
|
else:
|
||||||
|
if limg:
|
||||||
|
item.name ='div'
|
||||||
|
item.attrs =[]
|
||||||
|
else:
|
||||||
|
str = self.tag_to_string(item)
|
||||||
|
item.replaceWith(str)
|
||||||
|
return soup
|
||||||
|
@ -21,7 +21,7 @@ class LaTribuna(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_HN'
|
||||||
|
|
||||||
lang = 'es-HN'
|
lang = 'es-HN'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -15,12 +15,26 @@ class LeTemps(BasicNewsRecipe):
|
|||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
__author__ = 'Sujata Raman'
|
__author__ = 'Sujata Raman'
|
||||||
|
description = 'French news. Needs a subscription from http://www.letemps.ch'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
recursions = 1
|
recursions = 1
|
||||||
encoding = 'UTF-8'
|
encoding = 'UTF-8'
|
||||||
match_regexps = [r'http://www.letemps.ch/Page/Uuid/[-0-9a-f]+\|[1-9]']
|
match_regexps = [r'http://www.letemps.ch/Page/Uuid/[-0-9a-f]+\|[1-9]']
|
||||||
language = 'fr'
|
language = 'fr'
|
||||||
|
needs_subscription = True
|
||||||
|
|
||||||
|
def get_browser(self):
|
||||||
|
br = BasicNewsRecipe.get_browser(self)
|
||||||
|
br.open('http://www.letemps.ch/login')
|
||||||
|
br['username'] = self.username
|
||||||
|
br['password'] = self.password
|
||||||
|
raw = br.submit().read()
|
||||||
|
if '>Login' in raw:
|
||||||
|
raise ValueError('Failed to login to letemp.ch. Check '
|
||||||
|
'your username and password')
|
||||||
|
return br
|
||||||
|
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':'content'}),
|
keep_only_tags = [dict(name='div', attrs={'id':'content'}),
|
||||||
dict(name='div', attrs={'class':'story'})
|
dict(name='div', attrs={'class':'story'})
|
||||||
|
@ -9,6 +9,8 @@ __description__ = 'Canadian Paper '
|
|||||||
http://www.ledevoir.com/
|
http://www.ledevoir.com/
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class ledevoir(BasicNewsRecipe):
|
class ledevoir(BasicNewsRecipe):
|
||||||
@ -32,6 +34,8 @@ class ledevoir(BasicNewsRecipe):
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
|
||||||
|
preprocess_regexps = [(re.compile(r'(title|alt)=".*?>.*?"', re.DOTALL), lambda m: '')]
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name='div', attrs={'id':'article'}),
|
dict(name='div', attrs={'id':'article'}),
|
||||||
dict(name='ul', attrs={'id':'ariane'})
|
dict(name='ul', attrs={'id':'ariane'})
|
||||||
|
@ -18,7 +18,7 @@ class LosTiempos_Bol(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_BO'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
delay = 1
|
delay = 1
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
|
@ -12,7 +12,7 @@ import datetime
|
|||||||
class Milenio(BasicNewsRecipe):
|
class Milenio(BasicNewsRecipe):
|
||||||
title = u'Milenio-diario'
|
title = u'Milenio-diario'
|
||||||
__author__ = 'Bmsleight'
|
__author__ = 'Bmsleight'
|
||||||
language = 'es'
|
language = 'es_MX'
|
||||||
description = 'Milenio-diario'
|
description = 'Milenio-diario'
|
||||||
oldest_article = 10
|
oldest_article = 10
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
@ -20,7 +20,7 @@ class MiradasAlSur(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -12,7 +12,7 @@ class Noticias(BasicNewsRecipe):
|
|||||||
title = 'Montevideo COMM'
|
title = 'Montevideo COMM'
|
||||||
__author__ = 'Gustavo Azambuja'
|
__author__ = 'Gustavo Azambuja'
|
||||||
description = 'Noticias de Uruguay'
|
description = 'Noticias de Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -13,15 +13,12 @@ class MSNSankeiNewsProduct(BasicNewsRecipe):
|
|||||||
description = 'Products release from Japan'
|
description = 'Products release from Japan'
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
encoding = 'Shift_JIS'
|
encoding = 'utf-8'
|
||||||
language = 'ja'
|
language = 'ja'
|
||||||
cover_url = 'http://sankei.jp.msn.com/images/common/sankeShinbunLogo.jpg'
|
cover_url = 'http://sankei.jp.msn.com/images/common/sankeShinbunLogo.jpg'
|
||||||
masthead_url = 'http://sankei.jp.msn.com/images/common/sankeiNewsLogo.gif'
|
masthead_url = 'http://sankei.jp.msn.com/images/common/sankeiNewsLogo.gif'
|
||||||
|
|
||||||
feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')]
|
feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')]
|
||||||
|
|
||||||
remove_tags_before = dict(id="__r_article_title__")
|
remove_tags_before = dict(id="NewsTitle")
|
||||||
remove_tags_after = dict(id="ajax_release_news")
|
remove_tags_after = dict(id="RelatedTitle")
|
||||||
remove_tags = [{'class':"parent chromeCustom6G"},
|
|
||||||
dict(id="RelatedImg")
|
|
||||||
]
|
|
||||||
|
@ -20,7 +20,7 @@ class Newsweek_Argentina(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -12,7 +12,7 @@ class Noticias(BasicNewsRecipe):
|
|||||||
title = 'Observa Digital'
|
title = 'Observa Digital'
|
||||||
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
|
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
|
||||||
description = 'Noticias desde Uruguay'
|
description = 'Noticias desde Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -19,7 +19,7 @@ class Pagina12(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif'
|
masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif'
|
||||||
|
@ -17,7 +17,7 @@ class Perfil(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1252'
|
encoding = 'cp1252'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
masthead_url = 'http://www.perfil.com/export/sites/diarioperfil/arte/10/logo_perfilcom_mm.gif'
|
masthead_url = 'http://www.perfil.com/export/sites/diarioperfil/arte/10/logo_perfilcom_mm.gif'
|
||||||
extra_css = """
|
extra_css = """
|
||||||
|
@ -13,7 +13,7 @@ class Reptantes(BasicNewsRecipe):
|
|||||||
description = u"cada vez que te haces acupuntura, tu muñeco vudú sufre en algún lado"
|
description = u"cada vez que te haces acupuntura, tu muñeco vudú sufre en algún lado"
|
||||||
oldest_article = 130
|
oldest_article = 130
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
|
@ -12,7 +12,7 @@ class Noticias(BasicNewsRecipe):
|
|||||||
title = 'Revista Bla'
|
title = 'Revista Bla'
|
||||||
__author__ = 'Gustavo Azambuja'
|
__author__ = 'Gustavo Azambuja'
|
||||||
description = 'Moda | Uruguay'
|
description = 'Moda | Uruguay'
|
||||||
language = 'es'
|
language = 'es_UY'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 5
|
recursion = 5
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
|
||||||
'''
|
'''
|
||||||
theonion.com
|
theonion.com
|
||||||
@ -15,26 +13,39 @@ class TheOnion(BasicNewsRecipe):
|
|||||||
description = "America's finest news source"
|
description = "America's finest news source"
|
||||||
oldest_article = 2
|
oldest_article = 2
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
publisher = u'Onion, Inc.'
|
publisher = 'Onion, Inc.'
|
||||||
category = u'humor, news, USA'
|
category = 'humor, news, USA'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
remove_javascript = True
|
publication_type = 'newsportal'
|
||||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
|
masthead_url = 'http://o.onionstatic.com/img/headers/onion_190.png'
|
||||||
|
extra_css = """
|
||||||
|
body{font-family: Helvetica,Arial,sans-serif}
|
||||||
|
.section_title{color: gray; text-transform: uppercase}
|
||||||
|
.title{font-family: Georgia,serif}
|
||||||
|
.meta{color: gray; display: inline}
|
||||||
|
.has_caption{display: block}
|
||||||
|
.caption{font-size: x-small; color: gray; margin-bottom: 0.8em}
|
||||||
|
"""
|
||||||
|
|
||||||
html2lrf_options = [
|
conversion_options = {
|
||||||
'--comment' , description
|
'comment' : description
|
||||||
, '--category' , category
|
, 'tags' : category
|
||||||
, '--publisher' , publisher
|
, 'publisher': publisher
|
||||||
]
|
, 'language' : language
|
||||||
|
}
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
|
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(name='h2', attrs={'class':['section_title','title']})
|
||||||
|
,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
|
||||||
|
,dict(attrs={'id':['entries']})
|
||||||
|
]
|
||||||
|
remove_attributes=['lang','rel']
|
||||||
|
remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['object','link','iframe','base'])
|
dict(name=['object','link','iframe','base','meta'])
|
||||||
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
|
,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
|
||||||
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
|
,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
|
||||||
]
|
]
|
||||||
@ -44,3 +55,28 @@ class TheOnion(BasicNewsRecipe):
|
|||||||
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
(u'Daily' , u'http://feeds.theonion.com/theonion/daily' )
|
||||||
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def get_article_url(self, article):
|
||||||
|
artl = BasicNewsRecipe.get_article_url(self, article)
|
||||||
|
if artl.startswith('http://www.theonion.com/audio/'):
|
||||||
|
artl = None
|
||||||
|
return artl
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for item in soup.findAll(style=True):
|
||||||
|
del item['style']
|
||||||
|
for item in soup.findAll('a'):
|
||||||
|
limg = item.find('img')
|
||||||
|
if item.string is not None:
|
||||||
|
str = item.string
|
||||||
|
item.replaceWith(str)
|
||||||
|
else:
|
||||||
|
if limg:
|
||||||
|
item.name = 'div'
|
||||||
|
item.attrs = []
|
||||||
|
if not limg.has_key('alt'):
|
||||||
|
limg['alt'] = 'image'
|
||||||
|
else:
|
||||||
|
str = self.tag_to_string(item)
|
||||||
|
item.replaceWith(str)
|
||||||
|
return soup
|
||||||
|
@ -20,7 +20,7 @@ class Veintitres(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
language = 'es'
|
language = 'es_AR'
|
||||||
|
|
||||||
lang = 'es-AR'
|
lang = 'es-AR'
|
||||||
direction = 'ltr'
|
direction = 'ltr'
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
|
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
|
||||||
'''
|
'''
|
||||||
vijesti.me
|
vijesti.me
|
||||||
@ -18,12 +18,16 @@ class Vijesti(BasicNewsRecipe):
|
|||||||
oldest_article = 2
|
oldest_article = 2
|
||||||
max_articles_per_feed = 150
|
max_articles_per_feed = 150
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
encoding = 'cp1250'
|
encoding = 'utf8'
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
language = 'sr'
|
language = 'sr'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
masthead_url = 'http://www.vijesti.me/img/logo.gif'
|
extra_css = """
|
||||||
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
|
@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
|
||||||
|
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
|
||||||
|
body{font-family: Georgia,"Times New Roman",Times,serif1,serif}
|
||||||
|
.articledescription,.article,.chapter{font-family: sans1, sans-serif}
|
||||||
|
"""
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comment' : description
|
'comment' : description
|
||||||
@ -34,11 +38,11 @@ class Vijesti(BasicNewsRecipe):
|
|||||||
|
|
||||||
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
|
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':'mainnews'})]
|
keep_only_tags = [dict(name='div', attrs={'id':['article_intro_text','article_text']})]
|
||||||
|
|
||||||
remove_tags = [dict(name=['object','link','embed','form'])]
|
remove_tags = [dict(name=['object','link','embed','form'])]
|
||||||
|
|
||||||
feeds = [(u'Sve vijesti', u'http://www.vijesti.me/rss.php' )]
|
feeds = [(u'Sve vijesti', u'http://www.vijesti.me/rss/' )]
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
return self.adeify_images(soup)
|
return self.adeify_images(soup)
|
||||||
|
@ -35,7 +35,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_tags_before = dict(name='h1')
|
remove_tags_before = dict(name='h1')
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow"]),
|
dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow","articleTabs_tab_quotes","articleTabs_tab_document"]),
|
||||||
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
|
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
|
||||||
dict(rel='shortcut icon'),
|
dict(rel='shortcut icon'),
|
||||||
]
|
]
|
||||||
@ -101,7 +101,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
title = 'Front Section'
|
title = 'Front Section'
|
||||||
url = 'http://online.wsj.com' + a['href']
|
url = 'http://online.wsj.com' + a['href']
|
||||||
feeds = self.wsj_add_feed(feeds,title,url)
|
feeds = self.wsj_add_feed(feeds,title,url)
|
||||||
title = 'What''s News'
|
title = "What's News"
|
||||||
url = url.replace('pageone','whatsnews')
|
url = url.replace('pageone','whatsnews')
|
||||||
feeds = self.wsj_add_feed(feeds,title,url)
|
feeds = self.wsj_add_feed(feeds,title,url)
|
||||||
else:
|
else:
|
||||||
@ -131,6 +131,7 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
'description':desc, 'date':''})
|
'description':desc, 'date':''})
|
||||||
|
|
||||||
self.log('\tFound WN article:', title)
|
self.log('\tFound WN article:', title)
|
||||||
|
self.log('\t\t', desc)
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
@ -157,17 +158,23 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
meta = a.find(attrs={'class':'meta_sectionName'})
|
meta = a.find(attrs={'class':'meta_sectionName'})
|
||||||
if meta is not None:
|
if meta is not None:
|
||||||
meta.extract()
|
meta.extract()
|
||||||
title = self.tag_to_string(a).strip() + ' [%s]'%self.tag_to_string(meta)
|
meta = self.tag_to_string(meta).strip()
|
||||||
|
if meta:
|
||||||
|
title = self.tag_to_string(a).strip() + ' [%s]'%meta
|
||||||
|
else:
|
||||||
|
title = self.tag_to_string(a).strip()
|
||||||
url = 'http://online.wsj.com'+a['href']
|
url = 'http://online.wsj.com'+a['href']
|
||||||
desc = ''
|
desc = ''
|
||||||
p = container.find('p')
|
for p in container.findAll('p'):
|
||||||
if p is not None:
|
|
||||||
desc = self.tag_to_string(p)
|
desc = self.tag_to_string(p)
|
||||||
|
if not 'Subscriber Content' in desc:
|
||||||
|
break
|
||||||
|
|
||||||
articles.append({'title':title, 'url':url,
|
articles.append({'title':title, 'url':url,
|
||||||
'description':desc, 'date':''})
|
'description':desc, 'date':''})
|
||||||
|
|
||||||
self.log('\tFound article:', title)
|
self.log('\tFound article:', title)
|
||||||
|
self.log('\t\t', desc)
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
|
@ -10,7 +10,10 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
|
|
||||||
title = 'Wall Street Journal (free)'
|
title = 'Wall Street Journal (free)'
|
||||||
__author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
|
__author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
|
||||||
description = 'News and current affairs'
|
description = '''News and current affairs. This recipe only fetches complete
|
||||||
|
versions of the articles that are available free on the wsj.com website.
|
||||||
|
To get the rest of the articles, subscribe to the WSJ and use the other WSJ
|
||||||
|
recipe.'''
|
||||||
language = 'en'
|
language = 'en'
|
||||||
cover_url = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
|
cover_url = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
|
||||||
max_articles_per_feed = 1000
|
max_articles_per_feed = 1000
|
||||||
@ -137,12 +140,17 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
meta = a.find(attrs={'class':'meta_sectionName'})
|
meta = a.find(attrs={'class':'meta_sectionName'})
|
||||||
if meta is not None:
|
if meta is not None:
|
||||||
meta.extract()
|
meta.extract()
|
||||||
title = self.tag_to_string(a).strip() + ' [%s]'%self.tag_to_string(meta)
|
meta = self.tag_to_string(meta).strip()
|
||||||
|
if meta:
|
||||||
|
title = self.tag_to_string(a).strip() + ' [%s]'%meta
|
||||||
|
else:
|
||||||
|
title = self.tag_to_string(a).strip()
|
||||||
url = 'http://online.wsj.com'+a['href']
|
url = 'http://online.wsj.com'+a['href']
|
||||||
desc = ''
|
desc = ''
|
||||||
p = container.find('p')
|
for p in container.findAll('p'):
|
||||||
if p is not None:
|
|
||||||
desc = self.tag_to_string(p)
|
desc = self.tag_to_string(p)
|
||||||
|
if not 'Subscriber Content' in desc:
|
||||||
|
break
|
||||||
|
|
||||||
articles.append({'title':title, 'url':url,
|
articles.append({'title':title, 'url':url,
|
||||||
'description':desc, 'date':''})
|
'description':desc, 'date':''})
|
||||||
@ -151,6 +159,4 @@ class WallStreetJournal(BasicNewsRecipe):
|
|||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
def cleanup(self):
|
|
||||||
self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
|
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
"re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
|
"re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
|
||||||
"add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
|
"add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
|
||||||
"lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
|
"lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
|
||||||
"template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
|
"template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.__class__().safe_format(template, kwargs, 'TEMPLATE', mi)\n",
|
||||||
"print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
|
"print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
|
||||||
"titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
|
"titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
|
||||||
"test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
|
"test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
|
||||||
|
@ -360,6 +360,9 @@ class LinuxFreeze(Command):
|
|||||||
def main():
|
def main():
|
||||||
try:
|
try:
|
||||||
sys.argv[0] = sys.calibre_basename
|
sys.argv[0] = sys.calibre_basename
|
||||||
|
dfv = os.environ.get('CALIBRE_DEVELOP_FROM', None)
|
||||||
|
if dfv and os.path.exists(dfv):
|
||||||
|
sys.path.insert(0, os.path.abspath(dfv))
|
||||||
set_default_encoding()
|
set_default_encoding()
|
||||||
set_helper()
|
set_helper()
|
||||||
set_qt_plugin_path()
|
set_qt_plugin_path()
|
||||||
|
@ -2,7 +2,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
__appname__ = 'calibre'
|
__appname__ = 'calibre'
|
||||||
__version__ = '0.7.43'
|
__version__ = '0.7.44'
|
||||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
@ -474,7 +474,7 @@ from calibre.devices.binatone.driver import README
|
|||||||
from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
|
from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
|
||||||
from calibre.devices.edge.driver import EDGE
|
from calibre.devices.edge.driver import EDGE
|
||||||
from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
|
from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
|
||||||
SOVOS, PICO, SUNSTECH_EB700
|
SOVOS, PICO, SUNSTECH_EB700, ARCHOS7O
|
||||||
from calibre.devices.sne.driver import SNE
|
from calibre.devices.sne.driver import SNE
|
||||||
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
|
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
|
||||||
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
|
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD, ALURATEK_COLOR, \
|
||||||
@ -581,7 +581,7 @@ plugins += [
|
|||||||
ELONEX,
|
ELONEX,
|
||||||
TECLAST_K3,
|
TECLAST_K3,
|
||||||
NEWSMY,
|
NEWSMY,
|
||||||
PICO, SUNSTECH_EB700,
|
PICO, SUNSTECH_EB700, ARCHOS7O,
|
||||||
IPAPYRUS,
|
IPAPYRUS,
|
||||||
SOVOS,
|
SOVOS,
|
||||||
EDGE,
|
EDGE,
|
||||||
|
@ -22,13 +22,15 @@ Run an embedded python interpreter.
|
|||||||
parser.add_option('-d', '--debug-device-driver', default=False, action='store_true',
|
parser.add_option('-d', '--debug-device-driver', default=False, action='store_true',
|
||||||
help='Debug the specified device driver.')
|
help='Debug the specified device driver.')
|
||||||
parser.add_option('-g', '--gui', default=False, action='store_true',
|
parser.add_option('-g', '--gui', default=False, action='store_true',
|
||||||
help='Run the GUI',)
|
help='Run the GUI with debugging enabled. Debug output is '
|
||||||
|
'printed to stdout and stderr.')
|
||||||
parser.add_option('--gui-debug', default=None,
|
parser.add_option('--gui-debug', default=None,
|
||||||
help='Run the GUI with a debug console, logging to the'
|
help='Run the GUI with a debug console, logging to the'
|
||||||
' specified path',)
|
' specified path. For internal use only, use the -g'
|
||||||
|
' option to run the GUI in debug mode',)
|
||||||
parser.add_option('--show-gui-debug', default=None,
|
parser.add_option('--show-gui-debug', default=None,
|
||||||
help='Display the specified log file.',)
|
help='Display the specified log file. For internal use'
|
||||||
|
' only.',)
|
||||||
parser.add_option('-w', '--viewer', default=False, action='store_true',
|
parser.add_option('-w', '--viewer', default=False, action='store_true',
|
||||||
help='Run the ebook viewer',)
|
help='Run the ebook viewer',)
|
||||||
parser.add_option('--paths', default=False, action='store_true',
|
parser.add_option('--paths', default=False, action='store_true',
|
||||||
|
@ -183,9 +183,8 @@ class BOOQ(EB600):
|
|||||||
|
|
||||||
FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html']
|
FORMATS = ['epub', 'mobi', 'prc', 'fb2', 'pdf', 'doc', 'rtf', 'txt', 'html']
|
||||||
|
|
||||||
VENDOR_NAME = 'NETRONIX'
|
VENDOR_NAME = ['NETRONIX', '36LBOOKS']
|
||||||
WINDOWS_MAIN_MEM = 'EB600'
|
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['EB600', 'ELEQTOR']
|
||||||
WINDOWS_CARD_A_MEM = 'EB600'
|
|
||||||
|
|
||||||
class MENTOR(EB600):
|
class MENTOR(EB600):
|
||||||
|
|
||||||
|
@ -35,6 +35,16 @@ class DevicePlugin(Plugin):
|
|||||||
|
|
||||||
#: Height for thumbnails on the device
|
#: Height for thumbnails on the device
|
||||||
THUMBNAIL_HEIGHT = 68
|
THUMBNAIL_HEIGHT = 68
|
||||||
|
#: Width for thumbnails on the device. Setting this will force thumbnails
|
||||||
|
#: to this size, not preserving aspect ratio. If it is not set, then
|
||||||
|
#: the aspect ratio will be preserved and the thumbnail will be no higher
|
||||||
|
#: than THUMBNAIL_HEIGHT
|
||||||
|
# THUMBNAIL_WIDTH = 68
|
||||||
|
|
||||||
|
#: Set this to True if the device supports updating cover thumbnails during
|
||||||
|
#: sync_booklists. Setting it to true will ask device.py to refresh the
|
||||||
|
#: cover thumbnails during book matching
|
||||||
|
WANTS_UPDATED_THUMBNAILS = False
|
||||||
|
|
||||||
#: Whether the metadata on books can be set via the GUI.
|
#: Whether the metadata on books can be set via the GUI.
|
||||||
CAN_SET_METADATA = ['title', 'authors', 'collections']
|
CAN_SET_METADATA = ['title', 'authors', 'collections']
|
||||||
|
@ -89,21 +89,21 @@ class NOOK_COLOR(NOOK):
|
|||||||
BCD = [0x216]
|
BCD = [0x216]
|
||||||
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOK_DISK'
|
WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOK_DISK'
|
||||||
|
|
||||||
EBOOK_DIR_MAIN = 'My Files/Books'
|
EBOOK_DIR_MAIN = 'My Files'
|
||||||
|
|
||||||
'''
|
|
||||||
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
def create_upload_path(self, path, mdata, fname, create_dirs=True):
|
||||||
filepath = NOOK.create_upload_path(self, path, mdata, fname,
|
filepath = NOOK.create_upload_path(self, path, mdata, fname,
|
||||||
create_dirs=create_dirs)
|
create_dirs=False)
|
||||||
edm = self.EBOOK_DIR_MAIN.replace('/', os.sep)
|
edm = self.EBOOK_DIR_MAIN
|
||||||
npath = os.path.join(edm, _('News')) + os.sep
|
subdir = 'Books'
|
||||||
if npath in filepath:
|
if mdata.tags:
|
||||||
filepath = filepath.replace(npath, os.sep.join('My Files',
|
if _('News') in mdata.tags:
|
||||||
'Magazines')+os.sep)
|
subdir = 'Magazines'
|
||||||
filedir = os.path.dirname(filepath)
|
filepath = filepath.replace(os.sep+edm+os.sep,
|
||||||
if create_dirs and not os.path.exists(filedir):
|
os.sep+edm+os.sep+subdir+os.sep)
|
||||||
os.makedirs(filedir)
|
filedir = os.path.dirname(filepath)
|
||||||
|
if create_dirs and not os.path.exists(filedir):
|
||||||
|
os.makedirs(filedir)
|
||||||
|
|
||||||
return filepath
|
return filepath
|
||||||
'''
|
|
||||||
|
|
||||||
|
@ -8,5 +8,5 @@ CACHE_XML = 'Sony Reader/database/cache.xml'
|
|||||||
CACHE_EXT = 'Sony Reader/database/cacheExt.xml'
|
CACHE_EXT = 'Sony Reader/database/cacheExt.xml'
|
||||||
|
|
||||||
MEDIA_THUMBNAIL = 'database/thumbnail'
|
MEDIA_THUMBNAIL = 'database/thumbnail'
|
||||||
CACHE_THUMBNAIL = 'Sony Reader/database/thumbnail'
|
CACHE_THUMBNAIL = 'Sony Reader/thumbnail'
|
||||||
|
|
||||||
|
@ -81,12 +81,19 @@ class PRS505(USBMS):
|
|||||||
_('Set this option to have separate book covers uploaded '
|
_('Set this option to have separate book covers uploaded '
|
||||||
'every time you connect your device. Unset this option if '
|
'every time you connect your device. Unset this option if '
|
||||||
'you have so many books on the reader that performance is '
|
'you have so many books on the reader that performance is '
|
||||||
'unacceptable.')
|
'unacceptable.'),
|
||||||
|
_('Preserve cover aspect ratio when building thumbnails') +
|
||||||
|
':::' +
|
||||||
|
_('Set this option if you want the cover thumbnails to have '
|
||||||
|
'the same aspect ratio (width to height) as the cover. '
|
||||||
|
'Unset it if you want the thumbnail to be the maximum size, '
|
||||||
|
'ignoring aspect ratio.')
|
||||||
]
|
]
|
||||||
EXTRA_CUSTOMIZATION_DEFAULT = [
|
EXTRA_CUSTOMIZATION_DEFAULT = [
|
||||||
', '.join(['series', 'tags']),
|
', '.join(['series', 'tags']),
|
||||||
False,
|
False,
|
||||||
False
|
False,
|
||||||
|
True
|
||||||
]
|
]
|
||||||
|
|
||||||
OPT_COLLECTIONS = 0
|
OPT_COLLECTIONS = 0
|
||||||
@ -96,7 +103,7 @@ class PRS505(USBMS):
|
|||||||
plugboard = None
|
plugboard = None
|
||||||
plugboard_func = None
|
plugboard_func = None
|
||||||
|
|
||||||
THUMBNAIL_HEIGHT = 200
|
THUMBNAIL_HEIGHT = 217
|
||||||
|
|
||||||
MAX_PATH_LEN = 201 # 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
|
MAX_PATH_LEN = 201 # 250 - (max(len(CACHE_THUMBNAIL), len(MEDIA_THUMBNAIL)) +
|
||||||
# len('main_thumbnail.jpg') + 1)
|
# len('main_thumbnail.jpg') + 1)
|
||||||
@ -138,6 +145,13 @@ class PRS505(USBMS):
|
|||||||
if not write_cache(self._card_b_prefix):
|
if not write_cache(self._card_b_prefix):
|
||||||
self._card_b_prefix = None
|
self._card_b_prefix = None
|
||||||
self.booklist_class.rebuild_collections = self.rebuild_collections
|
self.booklist_class.rebuild_collections = self.rebuild_collections
|
||||||
|
# Set the thumbnail width to the theoretical max if the user has asked
|
||||||
|
# that we do not preserve aspect ratio
|
||||||
|
if not self.settings().extra_customization[3]:
|
||||||
|
self.THUMBNAIL_WIDTH = 168
|
||||||
|
# Set WANTS_UPDATED_THUMBNAILS if the user has asked that thumbnails be
|
||||||
|
# updated on every connect
|
||||||
|
self.WANTS_UPDATED_THUMBNAILS = self.settings().extra_customization[2]
|
||||||
|
|
||||||
def get_device_information(self, end_session=True):
|
def get_device_information(self, end_session=True):
|
||||||
return (self.gui_name, '', '', '')
|
return (self.gui_name, '', '', '')
|
||||||
|
@ -41,6 +41,16 @@ class NEWSMY(TECLAST_K3):
|
|||||||
WINDOWS_MAIN_MEM = 'NEWSMY'
|
WINDOWS_MAIN_MEM = 'NEWSMY'
|
||||||
WINDOWS_CARD_A_MEM = 'USBDISK____SD'
|
WINDOWS_CARD_A_MEM = 'USBDISK____SD'
|
||||||
|
|
||||||
|
class ARCHOS7O(TECLAST_K3):
|
||||||
|
name = 'Archos 7O device interface'
|
||||||
|
gui_name = 'Archos'
|
||||||
|
description = _('Communicate with the Archos reader.')
|
||||||
|
|
||||||
|
FORMATS = ['epub', 'mobi', 'fb2', 'rtf', 'ap', 'html', 'pdf', 'txt']
|
||||||
|
|
||||||
|
VENDOR_NAME = 'ARCHOS'
|
||||||
|
WINDOWS_MAIN_MEM = 'USB-MSC'
|
||||||
|
|
||||||
class PICO(NEWSMY):
|
class PICO(NEWSMY):
|
||||||
name = 'Pico device interface'
|
name = 'Pico device interface'
|
||||||
gui_name = 'Pico'
|
gui_name = 'Pico'
|
||||||
|
@ -113,7 +113,7 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
|
|||||||
|
|
||||||
def render_html(path_to_html, width=590, height=750, as_xhtml=True):
|
def render_html(path_to_html, width=590, height=750, as_xhtml=True):
|
||||||
from PyQt4.QtWebKit import QWebPage
|
from PyQt4.QtWebKit import QWebPage
|
||||||
from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize
|
from PyQt4.Qt import QEventLoop, QPalette, Qt, QUrl, QSize
|
||||||
from calibre.gui2 import is_ok_to_use_qt
|
from calibre.gui2 import is_ok_to_use_qt
|
||||||
if not is_ok_to_use_qt(): return None
|
if not is_ok_to_use_qt(): return None
|
||||||
path_to_html = os.path.abspath(path_to_html)
|
path_to_html = os.path.abspath(path_to_html)
|
||||||
@ -127,8 +127,7 @@ def render_html(path_to_html, width=590, height=750, as_xhtml=True):
|
|||||||
page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
|
page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
|
||||||
loop = QEventLoop()
|
loop = QEventLoop()
|
||||||
renderer = HTMLRenderer(page, loop)
|
renderer = HTMLRenderer(page, loop)
|
||||||
page.connect(page, SIGNAL('loadFinished(bool)'), renderer,
|
page.loadFinished.connect(renderer, type=Qt.QueuedConnection)
|
||||||
Qt.QueuedConnection)
|
|
||||||
if as_xhtml:
|
if as_xhtml:
|
||||||
page.mainFrame().setContent(open(path_to_html, 'rb').read(),
|
page.mainFrame().setContent(open(path_to_html, 'rb').read(),
|
||||||
'application/xhtml+xml', QUrl.fromLocalFile(path_to_html))
|
'application/xhtml+xml', QUrl.fromLocalFile(path_to_html))
|
||||||
@ -136,6 +135,7 @@ def render_html(path_to_html, width=590, height=750, as_xhtml=True):
|
|||||||
page.mainFrame().load(QUrl.fromLocalFile(path_to_html))
|
page.mainFrame().load(QUrl.fromLocalFile(path_to_html))
|
||||||
loop.exec_()
|
loop.exec_()
|
||||||
renderer.loop = renderer.page = None
|
renderer.loop = renderer.page = None
|
||||||
|
page.loadFinished.disconnect()
|
||||||
del page
|
del page
|
||||||
del loop
|
del loop
|
||||||
if isinstance(renderer.exception, ParserError) and as_xhtml:
|
if isinstance(renderer.exception, ParserError) and as_xhtml:
|
||||||
|
@ -139,6 +139,13 @@ class CHMReader(CHMFile):
|
|||||||
if self.hhc_path not in files and files:
|
if self.hhc_path not in files and files:
|
||||||
self.hhc_path = files[0]
|
self.hhc_path = files[0]
|
||||||
|
|
||||||
|
if self.hhc_path == '.hhc' and self.hhc_path not in files:
|
||||||
|
from calibre import walk
|
||||||
|
for x in walk(output_dir):
|
||||||
|
if os.path.basename(x).lower() in ('index.htm', 'index.html'):
|
||||||
|
self.hhc_path = os.path.relpath(x, output_dir)
|
||||||
|
break
|
||||||
|
|
||||||
def _reformat(self, data, htmlpath):
|
def _reformat(self, data, htmlpath):
|
||||||
try:
|
try:
|
||||||
data = xml_to_unicode(data, strip_encoding_pats=True)[0]
|
data = xml_to_unicode(data, strip_encoding_pats=True)[0]
|
||||||
|
@ -53,7 +53,7 @@ def find_pages(dir, sort_on_mtime=False, verbose=False):
|
|||||||
prints('\t'+'\n\t'.join([os.path.basename(p) for p in pages]))
|
prints('\t'+'\n\t'.join([os.path.basename(p) for p in pages]))
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
class PageProcessor(list):
|
class PageProcessor(list): # {{{
|
||||||
'''
|
'''
|
||||||
Contains the actual image rendering logic. See :method:`render` and
|
Contains the actual image rendering logic. See :method:`render` and
|
||||||
:method:`process_pages`.
|
:method:`process_pages`.
|
||||||
@ -111,6 +111,13 @@ class PageProcessor(list):
|
|||||||
|
|
||||||
SCRWIDTH, SCRHEIGHT = self.opts.output_profile.comic_screen_size
|
SCRWIDTH, SCRHEIGHT = self.opts.output_profile.comic_screen_size
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.opts.comic_image_size:
|
||||||
|
SCRWIDTH, SCRHEIGHT = map(int, [x.strip() for x in
|
||||||
|
self.opts.comic_image_size.split('x')])
|
||||||
|
except:
|
||||||
|
pass # Ignore
|
||||||
|
|
||||||
if self.opts.keep_aspect_ratio:
|
if self.opts.keep_aspect_ratio:
|
||||||
# Preserve the aspect ratio by adding border
|
# Preserve the aspect ratio by adding border
|
||||||
aspect = float(sizex) / float(sizey)
|
aspect = float(sizex) / float(sizey)
|
||||||
@ -170,6 +177,7 @@ class PageProcessor(list):
|
|||||||
dest = dest[:-1]
|
dest = dest[:-1]
|
||||||
os.rename(dest+'8', dest)
|
os.rename(dest+'8', dest)
|
||||||
self.append(dest)
|
self.append(dest)
|
||||||
|
# }}}
|
||||||
|
|
||||||
def render_pages(tasks, dest, opts, notification=lambda x, y: x):
|
def render_pages(tasks, dest, opts, notification=lambda x, y: x):
|
||||||
'''
|
'''
|
||||||
@ -291,7 +299,11 @@ class ComicInput(InputFormatPlugin):
|
|||||||
OptionRecommendation(name='no_process', recommended_value=False,
|
OptionRecommendation(name='no_process', recommended_value=False,
|
||||||
help=_("Apply no processing to the image")),
|
help=_("Apply no processing to the image")),
|
||||||
OptionRecommendation(name='dont_grayscale', recommended_value=False,
|
OptionRecommendation(name='dont_grayscale', recommended_value=False,
|
||||||
help=_('Do not convert the image to grayscale (black and white)'))
|
help=_('Do not convert the image to grayscale (black and white)')),
|
||||||
|
OptionRecommendation(name='comic_image_size', recommended_value=None,
|
||||||
|
help=_('Specify the image size as widthxheight pixels. Normally,'
|
||||||
|
' an image size is automatically calculated from the output '
|
||||||
|
'profile, this option overrides it.')),
|
||||||
])
|
])
|
||||||
|
|
||||||
recommendations = set([
|
recommendations = set([
|
||||||
|
@ -46,7 +46,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings',
|
|||||||
'italicize_common_cases', 'fix_indents',
|
'italicize_common_cases', 'fix_indents',
|
||||||
'html_unwrap_factor', 'unwrap_lines',
|
'html_unwrap_factor', 'unwrap_lines',
|
||||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||||
'dehyphenate', 'renumber_headings']
|
'dehyphenate', 'renumber_headings',
|
||||||
|
'replace_scene_breaks']
|
||||||
|
|
||||||
def print_help(parser, log):
|
def print_help(parser, log):
|
||||||
help = parser.format_help().encode(preferred_encoding, 'replace')
|
help = parser.format_help().encode(preferred_encoding, 'replace')
|
||||||
|
@ -531,6 +531,11 @@ OptionRecommendation(name='format_scene_breaks',
|
|||||||
'Replace soft scene breaks that use multiple blank lines with'
|
'Replace soft scene breaks that use multiple blank lines with'
|
||||||
'horizontal rules.')),
|
'horizontal rules.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='replace_scene_breaks',
|
||||||
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
|
help=_('Replace scene breaks with the specified text. By default, the '
|
||||||
|
'text from the input document is used.')),
|
||||||
|
|
||||||
OptionRecommendation(name='dehyphenate',
|
OptionRecommendation(name='dehyphenate',
|
||||||
recommended_value=True, level=OptionRecommendation.LOW,
|
recommended_value=True, level=OptionRecommendation.LOW,
|
||||||
help=_('Analyze hyphenated words throughout the document. The '
|
help=_('Analyze hyphenated words throughout the document. The '
|
||||||
|
@ -24,10 +24,16 @@ class HeuristicProcessor(object):
|
|||||||
self.chapters_no_title = 0
|
self.chapters_no_title = 0
|
||||||
self.chapters_with_title = 0
|
self.chapters_with_title = 0
|
||||||
self.blanks_deleted = False
|
self.blanks_deleted = False
|
||||||
|
self.blanks_between_paragraphs = False
|
||||||
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
|
||||||
|
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
|
||||||
|
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||||
|
self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||||
|
self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
|
||||||
|
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
||||||
|
|
||||||
def is_pdftohtml(self, src):
|
def is_pdftohtml(self, src):
|
||||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
@ -42,8 +48,10 @@ class HeuristicProcessor(object):
|
|||||||
" chapters. - " + unicode(chap))
|
" chapters. - " + unicode(chap))
|
||||||
return '<h2>'+chap+'</h2>\n'
|
return '<h2>'+chap+'</h2>\n'
|
||||||
else:
|
else:
|
||||||
txt_chap = html2text(chap)
|
delete_whitespace = re.compile('^\s*(?P<c>.*?)\s*$')
|
||||||
txt_title = html2text(title)
|
delete_quotes = re.compile('\'\"')
|
||||||
|
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(chap)))
|
||||||
|
txt_title = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(title)))
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
||||||
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
||||||
@ -184,19 +192,17 @@ class HeuristicProcessor(object):
|
|||||||
|
|
||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
init_lookahead = "(?=<(p|div))"
|
init_lookahead = "(?=<(p|div))"
|
||||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
chapter_line_open = self.line_open
|
||||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||||
chapter_header_open = r"(?P<chap>"
|
chapter_header_open = r"(?P<chap>"
|
||||||
title_header_open = r"(?P<title>"
|
title_header_open = r"(?P<title>"
|
||||||
chapter_header_close = ")\s*"
|
chapter_header_close = ")\s*"
|
||||||
title_header_close = ")"
|
title_header_close = ")"
|
||||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
chapter_line_close = self.line_close
|
||||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
||||||
|
|
||||||
is_pdftohtml = self.is_pdftohtml(html)
|
is_pdftohtml = self.is_pdftohtml(html)
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
|
|
||||||
chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
|
|
||||||
title_line_open = "<(?P<outer2>p)[^>]*>\s*"
|
title_line_open = "<(?P<outer2>p)[^>]*>\s*"
|
||||||
title_line_close = "\s*</(?P=outer2)>"
|
title_line_close = "\s*</(?P=outer2)>"
|
||||||
|
|
||||||
@ -371,13 +377,17 @@ class HeuristicProcessor(object):
|
|||||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||||
# Delete microsoft 'smart' tags
|
# Delete microsoft 'smart' tags
|
||||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||||
# Delete self closing paragraph tags
|
# Re-open self closing paragraph tags
|
||||||
html = re.sub('<p\s?/>', '', html)
|
html = re.sub('<p[^>/]*/>', '<p> </p>', html)
|
||||||
# Get rid of empty span, bold, font, em, & italics tags
|
# Get rid of empty span, bold, font, em, & italics tags
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
|
||||||
|
# delete surrounding divs from empty paragraphs
|
||||||
|
html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
|
||||||
|
# Empty heading tags
|
||||||
|
html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
|
||||||
self.deleted_nbsps = True
|
self.deleted_nbsps = True
|
||||||
return html
|
return html
|
||||||
|
|
||||||
@ -416,10 +426,99 @@ class HeuristicProcessor(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def merge_blanks(self, html, blanks_count=None):
    '''
    Collapse every run of two or more blank paragraphs into one spacer paragraph.

    Runs are located with self.any_multi_blank and the blank paragraphs in
    each run are counted with self.single_blank. The spacer gets a top margin
    of .5em plus 1.5em for every blank paragraph after the first, and a class
    name of 'whitespace' or 'softbreak' followed by ten times that margin
    (for example 'softbreak20' for a 2.0em margin).

    :param html: markup to process
    :param blanks_count: not used by this method; kept so existing callers
                         that pass it keep working
    :return: html with each run of blank paragraphs replaced by one spacer
    '''
    base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
    em_per_line = 1.5 # Add another 1.5 em for each additional blank

    def merge_matches(match):
        to_merge = match.group(0)
        lines = float(len(self.single_blank.findall(to_merge))) - 1.
        em = base_em + (em_per_line * lines)
        # Membership test rather than str.find(): find() returns -1 (truthy)
        # when the text is absent, which made the softbreak branch unreachable
        # for any run that did not start with the word 'whitespace'.
        if 'whitespace' in to_merge:
            css_class = 'whitespace'
        else:
            css_class = 'softbreak'
        # The whole run is replaced, so the spacer can be returned directly.
        return '\n<p class="'+css_class+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>'

    return self.any_multi_blank.sub(merge_matches, html)
|
||||||
|
|
||||||
|
def detect_whitespace(self, html):
    '''
    Mark up blank paragraphs that are being used for spacing rather than as
    scene breaks.

    Three passes are made over the markup:

    1. Blank paragraphs directly before and/or after a heading are removed
       and converted into margin-top / margin-bottom on the heading, 1em per
       blank paragraph. The heading's opening tag is rewritten to carry only
       that style attribute.
    2. Blank paragraphs adjacent to a short paragraph (at most about 100
       characters ending in a word character) are rewritten, via
       self.blankreg, as paragraphs with class "whitespace".
    3. When more sections have been detected than self.min_chapters, blank
       paragraphs in everything that precedes the first heading are given
       the same "whitespace" markup.

    :param html: markup to process
    :return: the processed markup
    '''
    blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
    blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)

    def merge_header_whitespace(match):
        initblanks = match.group('initparas')
        # The trailing blanks live in their own group; reading 'initparas'
        # twice silently discarded the paragraphs that follow the heading.
        endblanks = match.group('endparas')
        heading = match.group('heading')
        if initblanks is None and endblanks is None:
            return heading

        top_margin = ''
        bottom_margin = ''
        if initblanks is not None:
            top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
        if endblanks is not None:
            bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'

        return re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+r'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)

    html = blanks_around_headings.sub(merge_header_whitespace, html)

    def markup_whitespaces(match):
        blanks = match.group(0)
        blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
        return blanks

    html = blanks_n_nopunct.sub(markup_whitespaces, html)
    if self.html_preprocess_sections > self.min_chapters:
        # Everything from the start of the document up to the first heading
        html = re.sub(r'(?si)^.*?(?=<h\d)', markup_whitespaces, html)

    return html
|
||||||
|
|
||||||
|
def detect_soft_breaks(self, html):
    '''
    Rewrite blank paragraphs as paragraphs with class "softbreak".

    If blank paragraphs have not been deleted and the document interleaves
    blank paragraphs between its text, only the runs matched by
    self.multi_blank are replaced (1em top margin). Otherwise every match of
    self.blankreg is replaced (.5em top margin).

    :param html: markup to process
    :return: the processed markup
    '''
    if self.blanks_deleted or not self.blanks_between_paragraphs:
        pattern = self.blankreg
        soft_break = '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>'
    else:
        pattern = self.multi_blank
        soft_break = '\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>'
    return pattern.sub(soft_break, html)
|
||||||
|
|
||||||
|
def markup_user_break(self, replacement_break):
    '''
    Wrap the user supplied scene break string in markup that centers it and
    gives it appropriate margins.

    * Text containing no '<' or '>' has each whitespace character replaced
      by a space and is wrapped in the scene break paragraph.
    * A string starting with <hr is wrapped in a div whose left and right
      margins do the centering, because many ebook devices don't support
      margin:auto. If the tag carries a width (width=NN or width:NN), that
      number is read as a percentage, removed from the tag and used to
      compute the div margins. If the tag has no width, or one that cannot
      be parsed, a default horizontal rule is used instead.
    * A string starting with <img is wrapped in the scene break paragraph
      unchanged.
    * All other html is converted to text and then treated like plain text.

    :param replacement_break: the string supplied by the user
    :return: the markup to use as the scene break
    '''
    hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
    default_hr = hr_open+'<hr style="height: 3px; background:#505050" /></div>'

    if not re.search(r'[<>]', replacement_break):
        replacement_break = re.sub(r'\s', ' ', replacement_break)
        return self.scene_break_open+replacement_break+'</p>'

    if re.match(r'<hr', replacement_break):
        # A width written in a form this pattern does not recognise (for
        # example width="50") used to raise ValueError from int(); fall back
        # to the default rule instead of aborting the conversion.
        width_match = re.search(r'width(:|=)(?P<wnum>\d+)', replacement_break)
        if width_match is None:
            return default_hr
        width = int(width_match.group('wnum'))
        replacement_break = re.sub(r'(?i)(width=\d+\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break)
        # Floor division keeps the margin an integer percentage regardless of
        # whether true division is in effect.
        divpercent = (100 - width) // 2
        return hr_open.replace('45', str(divpercent))+replacement_break+'</div>'

    if re.match(r'<img', replacement_break):
        return self.scene_break_open+replacement_break+'</p>'

    from calibre.utils.html2text import html2text
    replacement_break = html2text(replacement_break)
    replacement_break = re.sub(r'\s', ' ', replacement_break)
    return self.scene_break_open+replacement_break+'</p>'
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, html):
|
def __call__(self, html):
|
||||||
self.log.debug("********* Heuristic processing HTML *********")
|
self.log.debug("********* Heuristic processing HTML *********")
|
||||||
|
|
||||||
# Count the words in the document to estimate how many chapters to look for and whether
|
# Count the words in the document to estimate how many chapters to look for and whether
|
||||||
# other types of processing are attempted
|
# other types of processing are attempted
|
||||||
try:
|
try:
|
||||||
@ -433,7 +532,7 @@ class HeuristicProcessor(object):
|
|||||||
|
|
||||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||||
html = self.arrange_htm_line_endings(html)
|
html = self.arrange_htm_line_endings(html)
|
||||||
|
#self.dump(html, 'after_arrange_line_endings')
|
||||||
if self.cleanup_required():
|
if self.cleanup_required():
|
||||||
###### Check Markup ######
|
###### Check Markup ######
|
||||||
#
|
#
|
||||||
@ -453,27 +552,32 @@ class HeuristicProcessor(object):
|
|||||||
# fix indents must run before this step, as it removes non-breaking spaces
|
# fix indents must run before this step, as it removes non-breaking spaces
|
||||||
html = self.cleanup_markup(html)
|
html = self.cleanup_markup(html)
|
||||||
|
|
||||||
|
is_pdftohtml = self.is_pdftohtml(html)
|
||||||
|
if is_pdftohtml:
|
||||||
|
self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
|
||||||
|
self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
|
||||||
|
|
||||||
# ADE doesn't render <br />, change to empty paragraphs
|
# ADE doesn't render <br />, change to empty paragraphs
|
||||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||||
|
|
||||||
# Determine whether the document uses interleaved blank lines
|
# Determine whether the document uses interleaved blank lines
|
||||||
blanks_between_paragraphs = self.analyze_blanks(html)
|
self.blanks_between_paragraphs = self.analyze_blanks(html)
|
||||||
|
|
||||||
#self.dump(html, 'before_chapter_markup')
|
|
||||||
# detect chapters/sections to match xpath or splitting logic
|
# detect chapters/sections to match xpath or splitting logic
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
|
||||||
|
#self.dump(html, 'after_chapter_markup')
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||||
html = self.markup_italicis(html)
|
html = self.markup_italicis(html)
|
||||||
|
|
||||||
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
||||||
# blank paragraphs then delete blank lines to clean up spacing
|
# blank paragraphs then delete blank lines to clean up spacing
|
||||||
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||||
self.log.debug("deleting blank lines")
|
self.log.debug("deleting blank lines")
|
||||||
self.blanks_deleted = True
|
self.blanks_deleted = True
|
||||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
|
||||||
html = self.blankreg.sub('', html)
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
# Determine line ending type
|
# Determine line ending type
|
||||||
@ -514,7 +618,7 @@ class HeuristicProcessor(object):
|
|||||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
self.log.debug("Looking for more split points based on punctuation,"
|
self.log.debug("Looking for more split points based on punctuation,"
|
||||||
" currently have " + unicode(self.html_preprocess_sections))
|
" currently have " + unicode(self.html_preprocess_sections))
|
||||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||||
html = chapdetect3.sub(self.chapter_break, html)
|
html = chapdetect3.sub(self.chapter_break, html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'renumber_headings', False):
|
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||||
@ -524,15 +628,32 @@ class HeuristicProcessor(object):
|
|||||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
|
||||||
|
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
|
||||||
|
# style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
|
||||||
|
# Multiple sequential blank paragraphs are merged with appropriate margins
|
||||||
|
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
|
||||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||||
# Center separator lines
|
html = self.detect_whitespace(html)
|
||||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
html = self.detect_soft_breaks(html)
|
||||||
if not self.blanks_deleted:
|
blanks_count = len(self.any_multi_blank.findall(html))
|
||||||
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
if blanks_count >= 1:
|
||||||
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
|
html = self.merge_blanks(html, blanks_count)
|
||||||
|
scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
|
||||||
|
scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
|
||||||
|
# If the user has enabled scene break replacement, then either softbreaks
|
||||||
|
# or 'hard' scene breaks are replaced, depending on which is in use
|
||||||
|
# Otherwise separator lines are centered, use a bit larger margin in this case
|
||||||
|
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
|
||||||
|
if replacement_break:
|
||||||
|
replacement_break = self.markup_user_break(replacement_break)
|
||||||
|
if len(scene_break.findall(html)) >= 1:
|
||||||
|
html = scene_break.sub(replacement_break, html)
|
||||||
|
else:
|
||||||
|
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
|
||||||
|
else:
|
||||||
|
html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
|
||||||
|
|
||||||
if self.deleted_nbsps:
|
if self.deleted_nbsps:
|
||||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
# put back non-breaking spaces in empty paragraphs so they render correctly
|
||||||
html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||||
html = self.softbreak.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
|
||||||
return html
|
return html
|
||||||
|
@ -175,6 +175,19 @@ class EPUBInput(InputFormatPlugin):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
'EPUB files with DTBook markup are not supported')
|
'EPUB files with DTBook markup are not supported')
|
||||||
|
|
||||||
|
for x in list(opf.iterspine()):
|
||||||
|
ref = x.get('idref', None)
|
||||||
|
if ref is None:
|
||||||
|
x.getparent().remove(x)
|
||||||
|
continue
|
||||||
|
for y in opf.itermanifest():
|
||||||
|
if y.get('id', None) == ref and y.get('media-type', None) in \
|
||||||
|
('application/vnd.adobe-page-template+xml',):
|
||||||
|
p = x.getparent()
|
||||||
|
if p is not None:
|
||||||
|
p.remove(x)
|
||||||
|
break
|
||||||
|
|
||||||
with open('content.opf', 'wb') as nopf:
|
with open('content.opf', 'wb') as nopf:
|
||||||
nopf.write(opf.render())
|
nopf.write(opf.render())
|
||||||
|
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user