Handle image templates that don't use zero padding (cctbx#705)

* Use a single `#` character in a filename template to represent a non-zero-padded incremental number.
dials · Apr 18, 2024 · 24a50c7 · 24a50c7
1 parent dd77a69
commit 24a50c7
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 18 deletions.
diff --git a/newsfragments/705.feature b/newsfragments/705.feature
@@ -0,0 +1,10 @@
+The template handling mechanism is extended so that a template with a
+single ``#`` is expanded to match non-zero-padded sequential numbers.
+For example, ``image_#.cbf`` will match ``image_1.cbf``, ``image_2.cbf``,
+..., ``image_10.cbf`` and so on.
+
+Using a single ``#`` to match up to 10 images *within* a zero-padded
+sequence continues to work as before. For example,
+``dials.import template=insulin_1_01#.img`` will match the files
+``insulin_1_010.img``, ``insulin_1_011.img``, ..., ``insulin_1_019.img``,
+and no others.
diff --git a/src/dxtbx/imageset.py b/src/dxtbx/imageset.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import natsort
+
 import boost_adaptbx.boost.python
 
 import dxtbx.format.image  # noqa: F401, import dependency for unpickling
@@ -45,7 +47,9 @@
 )
 
 
-def _expand_template(template: str, indices: Iterable[int]) -> list[str]:
+def _expand_template_to_sorted_filenames(
+    template: str, indices: Iterable[int]
+) -> list[str]:
     """Expand a template string to a list of filenames.
 
     Args:
@@ -55,7 +59,13 @@ def _expand_template(template: str, indices: Iterable[int]) -> list[str]:
     pfx = template.split("#")[0]
     sfx = template.split("#")[-1]
     count = template.count("#")
-    return [f"{pfx}{index:0{count}}{sfx}" for index in indices]
+    if count == 1:
+        # Special handling for a template with a single "#", which does not
+        # assume a zero-padded index.
+        filenames = [f"{pfx}{index}{sfx}" for index in indices]
+    else:
+        filenames = [f"{pfx}{index:0{count}}{sfx}" for index in indices]
+    return natsort.natsorted(filenames)
 
 
 class MemReader:
@@ -449,7 +459,7 @@ def from_template(
 
             # Set the image range
             indices = range(image_range[0], image_range[1] + 1)
-            filenames = _expand_template(template, indices)
+            filenames = _expand_template_to_sorted_filenames(template, indices)
         else:
             if "master" not in template:
                 raise ValueError("Invalid template")
@@ -486,7 +496,7 @@ def _create_imageset(filelist, check_headers):
 
         # Get the template format
         if "#" in template:
-            filenames = sorted(_expand_template(template, indices))
+            filenames = _expand_template_to_sorted_filenames(template, indices)
         else:
             filenames = [template]
 
@@ -503,7 +513,7 @@ def _create_sequence(filelist, check_headers):
 
         # Expand the template if necessary
         if "#" in template:
-            filenames = sorted(_expand_template(template, indices))
+            filenames = _expand_template_to_sorted_filenames(template, indices)
         else:
             filenames = [template]
 
@@ -564,7 +574,7 @@ def make_sequence(
 
         # Get the template format
         if "#" in template:
-            filenames = sorted(_expand_template(template, indices))
+            filenames = _expand_template_to_sorted_filenames(template, indices)
         else:
             filenames = [template]
 

diff --git a/src/dxtbx/model/experiment_list.py b/src/dxtbx/model/experiment_list.py
@@ -915,8 +915,6 @@ def from_templates(templates, **kwargs):
                     f"Image file {filenames[0]} appears to be a '{type(format_class).__name__}', but this is an abstract Format"
                 )
             else:
-                index = slice(*template_string_number_index(template))
-
                 image_range = kwargs.get("image_range")
                 if image_range:
                     first, last = image_range
@@ -926,7 +924,13 @@ def from_templates(templates, **kwargs):
                 if not kwargs.get("allow_incomplete_sequences", False):
                     if "#" in template:
                         # Check all images in range are present - if allowed
-                        all_numbers = {int(f[index]) for f in filenames}
+                        i0, i1 = template_string_number_index(template)
+                        prefix = template[:i0]
+                        suffix = template[i1:]
+                        all_numbers = {
+                            int(f.replace(prefix, "").replace(suffix, ""))
+                            for f in filenames
+                        }
                         missing = set(range(first, last + 1)) - all_numbers
                         if missing:
                             raise ValueError(

diff --git a/src/dxtbx/model/scan_helpers.py b/src/dxtbx/model/scan_helpers.py
@@ -14,10 +14,11 @@
     r"([0-9]{2,12})\.(.*)",
     r"(.*)\.([0-9]{2,12})_(.*)",
     r"(.*)\.([0-9]{2,12})(.*)",
+    r"(.*)\.([0-9]{1,12})([^0]*)_(.*)",
     r"(.*)\.([0-9]{1})(.*)",
 ]
 
-joiners = [".", "_", "", ""]
+joiners = [".", "_", "", "_", ""]
 
 compiled_patterns = [re.compile(pattern) for pattern in patterns]
 
@@ -38,6 +39,10 @@ def template_regex(filename):
             exten = "." + groups[0][::-1]
             digits = groups[1][::-1]
             prefix = groups[2][::-1] + joiners[j]
+        elif len(groups) == 4:
+            exten = "." + groups[0][::-1]
+            digits = groups[1][::-1]
+            prefix = groups[3][::-1] + joiners[j] + groups[2][::-1]
         else:
             exten = ""
             digits = groups[0][::-1]

diff --git a/src/dxtbx/sequence_filenames.py b/src/dxtbx/sequence_filenames.py
@@ -5,6 +5,8 @@
 from collections import defaultdict
 from glob import glob
 
+import natsort
+
 
 def template_regex(filename):
     """Works out a template from a filename.
@@ -181,6 +183,9 @@ def replace_template_format_with_hash(match):
 
 def template_string_to_glob_expr(template):
     """Convert the template to a glob expression."""
+    if template.count("#") == 1:
+        # https://github.com/cctbx/dxtbx/issues/646
+        return template.replace("#", "*")
     return template.replace("#", "[0-9]")
 
 
@@ -191,27 +196,34 @@ def template_string_number_index(template):
 
 def locate_files_matching_template_string(template):
     """Return all files matching template."""
-    return glob(template_string_to_glob_expr(template))
+    matches = glob(template_string_to_glob_expr(template))
+    if template.count("#") != 1:
+        return matches
+    matches = [os.path.split(p) for p in matches]
+    i0, i1 = template_string_number_index(template)
+    suffix = template[i1:]
+    patt = re.compile("([^0]*)([0-9]+)" + suffix)
+    return [os.path.join(*m) for m in matches if patt.match(m[1])]
 
 
 def template_image_range(template):
     """Return the image range of files with this template."""
 
     # Find the files matching the template
     filenames = locate_files_matching_template_string(template)
-    filenames = sorted(filenames)
+    filenames = natsort.natsorted(filenames)
 
     # Check that the template matches some files
     if len(filenames) == 0:
         raise ValueError(f"Template {template} doesn't match any files.")
 
-    # Get the templete format
-    index = slice(*template_string_number_index(template))
-
     # Get the first and last indices
     if "#" in template:
-        first = int(filenames[0][index])
-        last = int(filenames[-1][index])
+        i0, i1 = template_string_number_index(template)
+        prefix = template[:i0]
+        suffix = template[i1:]
+        first = int(filenames[0].replace(prefix, "").replace(suffix, ""))
+        last = int(filenames[-1].replace(prefix, "").replace(suffix, ""))
     else:  # template is one file
         first, last = 0, 0
 

diff --git a/tests/test_sequence_filenames.py b/tests/test_sequence_filenames.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
+import shutil
+
 import pytest
 
-from dxtbx.sequence_filenames import template_regex
+from dxtbx.sequence_filenames import template_image_range, template_regex
 
 
 @pytest.mark.parametrize(
@@ -15,7 +17,26 @@
         ("foo_bar_002.img1000", "foo_bar_###.img1000", 2),
         ("foo_bar_00005.img", "foo_bar_#####.img", 5),
         ("image0010", "image####", 10),
+        ("foo_123_1_1.rodhypix", "foo_123_1_#.rodhypix", 1),  # Rigaku-style
     ],
 )
 def test_template_regex(filename, template, digits):
     assert template_regex(filename) == (template, digits)
+
+
+def test_template_image_range(dials_data):
+    template = str(dials_data("insulin", pathlib=True) / "insulin_1_###.img")
+    assert template_image_range(template) == (1, 45)
+
+
+def test_template_image_range_non_zero_padded(dials_data, tmp_path):
+    images = sorted(dials_data("insulin", pathlib=True).glob("insulin_1_0[0-1]*"))
+    # symlink if possible, copy if necessary
+    for i, image in enumerate(images):
+        try:
+            (tmp_path / f"insulin_1_{i + 1}.img").symlink_to(image)
+        except OSError:
+            shutil.copy(image, (tmp_path / f"insulin_1_{i + 1}.img"))
+
+    template = str(tmp_path / "insulin_1_#.img")
+    assert template_image_range(template) == (1, 19)