git subrepo pull modules

subrepo: subdir: "modules" merged: "ed88343f4d" upstream: origin: "git@github.com:daisy/pipeline-modules.git" branch: "master" commit: "aec26b7fbc" git-subrepo: version: "0.3.1" origin: "???" commit: "???"
daisy · Mar 26, 2024 · 190fb74 · 190fb74
1 parent c4698a0
commit 190fb74
Show file tree

Hide file tree

Showing 52 changed files with 4,254 additions and 7 deletions.
diff --git a/modules/.gitrepo b/modules/.gitrepo
@@ -7,5 +7,5 @@
 	remote = git@github.com:daisy/pipeline-modules.git
 	branch = master
 	commit = aec26b7fbcf3f72681e85e4807b98112c59cb505
-	parent = 7b6dc20e0deda4199a09d45da6f52110467954a9
+	parent = 0986f76b352959fd39fb2796d4ac6cc5390dee0d
 	cmdver = 0.3.1
diff --git a/modules/bom/pom.xml b/modules/bom/pom.xml
@@ -235,7 +235,7 @@
       <dependency>
         <groupId>org.daisy.pipeline.modules</groupId>
         <artifactId>dtbook-utils</artifactId>
-        <version>4.1.1</version>
+        <version>4.2.0-SNAPSHOT</version>
       </dependency>
       <dependency>
         <groupId>org.daisy.pipeline.modules</groupId>

diff --git a/modules/parent/pom.xml b/modules/parent/pom.xml
@@ -427,7 +427,7 @@
       <dependency>
         <groupId>org.daisy.pipeline</groupId>
         <artifactId>framework-bom</artifactId>
-        <version>1.14.16</version>
+        <version>1.14.17-SNAPSHOT</version>
         <type>pom</type>
         <scope>import</scope>
       </dependency>

diff --git a/modules/scripts-utils/dtbook-utils/doc/index.md b/modules/scripts-utils/dtbook-utils/doc/index.md
@@ -0,0 +1,24 @@
+<link rel="dp2:permalink" href="http://daisy.github.io/pipeline/Get-Help/User-Guide/Scripts/dtbook-cleaner/"/>
+<link rev="dp2:doc" href="../src/main/resources/xml/fix-dtbook/dtbook-cleaner.script.xpl"/>
+<link rel="rdf:type" href="http://www.daisy.org/ns/pipeline/userdoc"/>
+
+# DTBook Cleaner
+
+The "DTBook cleaner" script applies fixing routines to a DTBook XML
+file. Three sets of routines are provided and can be individually
+activated: a "repair" set, a "tidy" set and a "narrator" set.
+Optionally, the script can apply sentence detection on the DTBook
+file.
+
+More information on the fixes applied by the script for each set of
+routines is provided on the [Detailed cleaning routines](routines.md)
+page.
+
+## Table of contents
+
+{{>toc}}
+
+## Synopsis
+
+{{>synopsis}}
+
diff --git a/modules/scripts-utils/dtbook-utils/doc/routines.md b/modules/scripts-utils/dtbook-utils/doc/routines.md
@@ -0,0 +1,114 @@
+<link rel="rdf:type" href="http://www.daisy.org/ns/pipeline/userdoc"/>
+<meta property="dc:title" content="DTBook cleaning routine details"/>
+
+# DTBook cleaning routine details
+
+This page details the cleaning process applied by the DTBook Cleaner
+script for each set of routines provided.
+
+## Table of contents
+
+{{>toc}}
+
+-----------
+
+### Repair routines
+
+The "repair" set of routines repairs some structural errors when encountered:
+- Removes `levelx` if it has descendant headings of x-1 
+  (this simplifies later steps). <br/>
+  Note: Level normalizer cannot fix `level1/level2/level1`
+- Splits a level into several levels on every additional heading on the same
+  level
+- Adds levels where needed.
+- Changes a `hx` into a `p` with `@class="hx"` if parent isn't `levelx`<br/>
+  Note: "Remove illegal headings" cannot handle `hx` in inline context.
+  Support for this could be added.
+- Removes nested `p`
+- Adds an empty `p`-tag if `hx` is the last element
+- Applies fixes for lists:
+    - wraps a list in `li` when the parent of the list is another list
+    - adds `@type` if missing (default value is `"pl"`)
+    - corrects `@depth` attribute
+    - removes `@enum` attribute if the list is not ordered
+    - removes `@start` attribute if the list is not ordered
+- `idref` must be present on `noteref` and `annoref`. Add `idref` if missing or 
+  change if empty.<br/>
+  The value of the `idref` must include a fragment identifier. Add a hash mark 
+  in the beginning of all idref attributes that don't contain a hash mark.
+- Removes
+    - empty/whitespace `p` except when
+        - preceded by `hx` or no preceding element and parent is a level
+        - and followed only by other empty `p`
+    - empty/whitespace `em`, `strong`, `sub`, `sup`
+    - empty/whitespace elements that must have children.
+- Updates the `@page` attribute to make it match the contents of the `pagenum`
+  element.
+    - If `@page="normal"` but the contents of the element doesn't match "normal"
+      content, the @page attribute is changed to:
+        - `@page="front"` if the contents is roman numerals and the pagenum
+          element is located in the frontmatter of the book
+        - `@page="special"` otherwise
+    - If `@page="front"` but the contents of the element doesn't match "front"
+      content (neither roman nor arabic numerals), the @page attribute is
+      changed to "special"
+- Fixes metadata case errors
+    - remove unknown dc-metadata
+    - add `dtb:uid` (if missing) from dc:Identifier
+    - add `dc:Title` (if missing) from doctitle
+    - add auto-generated `dtb:uid` if missing (or if it has empty contents)
+
+### Tidy routines
+
+The "tidy" set of routines removes empty elements and reorganizes misplaced
+elements:
+- Removes
+    - empty/whitespace `p` except when
+        - preceded by `hx` or no preceding element and parent is a level
+        - and followed only by other empty `p`
+    - empty/whitespace `em`, `strong`, `sub`, `sup`
+- Moves
+    - `pagenum` inside `h[x]` before `h[x]`
+    - `pagenum` inside a word after the word
+- Updates the `@page` attribute to make it match the contents of the `pagenum`
+  element.
+    - If `@page="front"` but the contents of the element is an arabic number,
+      the `@page` attribute is changed to `"normal"`<br/>
+      (Note: arabic numbers are theoretically allowed for `@page="front"`, but
+      are not considered standard practice by many)
+    - If `@page="special"` but the element has no content, adds a dummy content
+      ("page break").
+- Removes otherwise empty `p` or `li` around `pagenum` (except `p` in `td`)
+- Inserts `docauthor` and `doctitle` if a `frontmatter` exists without those
+  elements
+- Removes existing whitespace nodes and indents output to aid debugging.
+    - Does not remove whitespace or apply indentation in inline context
+    - Does not apply indentation when number of children is 1
+
+### Narrator routines
+
+The "narrator" set of routines tries to optimize the DTBook for text-to-speech
+processes:
+- Adds `dc:Language`, `dc:Date` and `dc:Publisher` to dtbook, if not present in
+  input, or given but with null/whitespace only content values
+- Removes `dc:description` and `dc:subject` if not valued
+- Removes `dc:Format` (will be added by the fileset generator)
+- Prepares the DTBook for audio synthesis:
+    - Don't allow `h[x+1]` in `level[x+1]` unless `h[x]` in `level[x]` is present
+        - This fix assumes headings are not empty (e.g. empty headings were
+          removed by a previous fix)
+    - Every document needs at least one heading on `level1`
+        - This fix assumes headings are not empty (e.g. empty headings were 
+          removed by a previous fix)
+    - No `list` or `dl` inside `p`:
+        - Breaks the parent paragraph into a sequence of paragraphs, list and dl
+        - Each newly created paragraph has the same attributes as the original
+          one
+        - New paragraph IDs are created if necessary
+        - The original paragraph ID is conserved for the first paragraph created
+- Adds the `dc:Title` meta element and the `doctitle` frontmatter element, if
+  not present in input, or given but with null/whitespace only content values.
+  <br/>Title value is taken:
+    - from the `dc:Title` metadata if it is present
+    - or else from the first `doctitle` element in the `frontmatter`
+    - or else from the first heading 1.
diff --git a/modules/scripts-utils/dtbook-utils/pom.xml b/modules/scripts-utils/dtbook-utils/pom.xml
@@ -11,12 +11,26 @@
   </parent>
 
   <artifactId>dtbook-utils</artifactId>
-  <version>4.1.2-SNAPSHOT</version>
+  <version>4.2.0-SNAPSHOT</version>
   <packaging>bundle</packaging>
 
   <name>DAISY Pipeline 2 module :: DTBook Utils</name>
 
   <dependencies>
+    <!-- for XProcScriptService -->
+    <dependency>
+      <groupId>org.daisy.pipeline</groupId>
+      <artifactId>framework-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.daisy.pipeline</groupId>
+      <artifactId>saxon-adapter</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.daisy.pipeline</groupId>
+      <artifactId>ds-to-spi-runtime</artifactId>
+      <scope>provided</scope>
+    </dependency>
     <!--
         runtime dependencies
     -->
@@ -60,12 +74,27 @@
       <artifactId>mathml-utils</artifactId>
       <scope>runtime</scope>
     </dependency>
+    <dependency>
+      <groupId>org.daisy.pipeline.modules</groupId>
+      <artifactId>dtbook-break-detection</artifactId>
+      <scope>runtime</scope>
+    </dependency>
+    <!--
+        test dependencies
+    -->
+    <dependency>
+      <groupId>org.daisy.pipeline.modules</groupId>
+      <artifactId>nlp-omnilang-lexer</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <properties>
     <expose-services>
-      org.daisy.pipeline.modules.impl.Module_dtbook_utils
+      org.daisy.pipeline.modules.impl.Module_dtbook_utils,
+      org.daisy.pipeline.dtbook.saxon.impl.DTBookCleanerLibrary$Provider,
+      org.daisy.pipeline.script.impl.XProcScript_dtbook_cleaner
     </expose-services>
   </properties>
 
-</project>
+</project>
diff --git a/...dtbook-utils/src/main/java/org/daisy/pipeline/dtbook/saxon/impl/DTBookCleanerLibrary.java b/...dtbook-utils/src/main/java/org/daisy/pipeline/dtbook/saxon/impl/DTBookCleanerLibrary.java
@@ -0,0 +1,30 @@
+package org.daisy.pipeline.dtbook.saxon.impl;
+
+import java.util.Locale;
+import java.util.Date;
+import java.text.SimpleDateFormat;
+
+import org.daisy.common.xpath.saxon.ExtensionFunctionProvider;
+import org.daisy.common.xpath.saxon.ReflexiveExtensionFunctionProvider;
+import org.osgi.service.component.annotations.Component;
+
+/**
+ * Collection of static helper functions (current date, default locale).
+ *
+ * <p>The nested {@link Provider} hands this class to
+ * {@link ReflexiveExtensionFunctionProvider} and is registered as an
+ * {@link ExtensionFunctionProvider} service, which presumably makes the public
+ * static methods below callable as XPath extension functions (confirm against
+ * the documentation of {@code ReflexiveExtensionFunctionProvider}).</p>
+ */
+public class DTBookCleanerLibrary {
+
+    /**
+     * Declarative-services component that registers {@link DTBookCleanerLibrary}
+     * as an {@link ExtensionFunctionProvider}.
+     */
+    @Component(
+        name = "DTBookCleanerLibrary",
+        service = { ExtensionFunctionProvider.class }
+    )
+    public static class Provider extends ReflexiveExtensionFunctionProvider {
+        public Provider() {
+            super(DTBookCleanerLibrary.class);
+        }
+    }
+
+    /**
+     * Returns the default locale of the running JVM as a language tag.
+     *
+     * <p>{@link Locale#toLanguageTag()} is used instead of replacing
+     * underscores in {@link Locale#toString()}: the latter yields strings such
+     * as {@code zh-TW-#Hant} for locales that carry a script or extensions,
+     * which are not well-formed language tags. For plain language/country
+     * locales (e.g. {@code en_US}) both approaches give the same result
+     * ({@code en-US}).</p>
+     *
+     * @return a well-formed IETF BCP 47 language tag, e.g. {@code "en-US"}
+     */
+    public static String getDefaultLocale() {
+        return Locale.getDefault().toLanguageTag();
+    }
+
+    /**
+     * Returns the current date formatted as {@code yyyy-MM-dd}.
+     *
+     * <p>The formatter is bound to {@link Locale#ROOT} so that the result does
+     * not depend on the default locale of the JVM. With the single-argument
+     * constructor, the default locale's calendar system and digits are used,
+     * which can produce a different year or non-ASCII digits.</p>
+     *
+     * @return the current date in the default time zone, e.g. {@code "2024-03-26"}
+     */
+    public static String getDate() {
+        return new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT).format(new Date());
+    }
+}
diff --git a/modules/scripts-utils/dtbook-utils/src/main/resources/META-INF/catalog.xml b/modules/scripts-utils/dtbook-utils/src/main/resources/META-INF/catalog.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<catalog xmlns="urn:oasis:names:tc:entity:xmlns:xml:catalog">
+<catalog xmlns="urn:oasis:names:tc:entity:xmlns:xml:catalog" xmlns:px="http://www.daisy.org/ns/pipeline">
 
    <public publicId='-//NISO//DTD dtbook 2005-1//EN' uri='../xml/dtd/dtbook-2005-1.dtd'/>
    <system systemId='http://www.daisy.org/z3986/2005/dtbook-2005-1.dtd' uri='../xml/dtd/dtbook-2005-1.dtd'/>
@@ -21,6 +21,8 @@
    <uri name="http://www.daisy.org/pipeline/modules/dtbook-utils/library.xpl"
         uri="../xml/dtbook-utils-library.xpl"/>
 
+   <uri uri="../xml/fix-dtbook/dtbook-cleaner.script.xpl" px:content-type="script" px:id="dtbook-cleaner"/>
+
    <nextCatalog catalog="org:daisy:pipeline:modules:fileset-utils"/>
    <nextCatalog catalog="org:daisy:pipeline:modules:mediatype-utils"/>
    <nextCatalog catalog="org:daisy:pipeline:modules:validation-utils"/>
@@ -29,4 +31,5 @@
    <nextCatalog catalog="org:daisy:pipeline:modules:css-utils" />
    <nextCatalog catalog="org:daisy:pipeline:modules:metadata-utils"/>
    <nextCatalog catalog="org:daisy:pipeline:modules:mathml-utils"/>
+   <nextCatalog catalog="org:daisy:pipeline:modules:dtbook-break-detection"/>
 </catalog>
diff --git a/modules/scripts-utils/dtbook-utils/src/main/resources/xml/fix-dtbook/doctyping.xpl b/modules/scripts-utils/dtbook-utils/src/main/resources/xml/fix-dtbook/doctyping.xpl
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<p:declare-step type="pxi:dtbook-doctyping" name="main" version="1.0"
+                xmlns:p="http://www.w3.org/ns/xproc"
+                xmlns:px="http://www.daisy.org/ns/pipeline/xproc"
+                xmlns:pxi="http://www.daisy.org/ns/pipeline/xproc/internal"
+                xmlns:cx="http://xmlcalabash.com/ns/extensions"
+                xmlns:xs="http://www.w3.org/2001/XMLSchema">
+
+    <!--
+        Internal step: runs xsl/export-doctype.xsl over a single DTBook
+        document, passing the "css" option to the stylesheet as a parameter.
+    -->
+
+    <p:input port="source" sequence="false" px:media-type="application/x-dtbook+xml">
+        <p:documentation xmlns="http://www.w3.org/1999/xhtml">
+            <p>A single DTBook document</p>
+        </p:documentation>
+    </p:input>
+
+    <p:option name="css" select="''" required="false">
+        <p:documentation xmlns="http://www.w3.org/1999/xhtml">
+            <h2>CSS</h2>
+            <p>CSS stylesheet path relative to the document</p>
+        </p:documentation>
+    </p:option>
+
+    <p:output port="result" sequence="false">
+        <p:documentation xmlns="http://www.w3.org/1999/xhtml">
+            <p>The result DTBook document as serialized content</p>
+        </p:documentation>
+        <p:pipe step="add-doctype" port="result"/>
+    </p:output>
+
+    <!--
+        cx:serialize is an extension attribute from the XML Calabash namespace;
+        as the result port documentation states, the output is delivered as
+        serialized content.
+    -->
+    <p:xslt cx:serialize="true" name="add-doctype">
+        <!-- Explicit form of the default binding to this step's own input. -->
+        <p:input port="source">
+            <p:pipe step="main" port="source"/>
+        </p:input>
+        <p:input port="stylesheet">
+            <p:document href="xsl/export-doctype.xsl"/>
+        </p:input>
+        <p:with-param name="css" select="$css"/>
+    </p:xslt>
+
+</p:declare-step>