Skip to content

Commit

Permalink
Handle image templates that don't use zero padding (cctbx#705)
Browse files Browse the repository at this point in the history
* Use a single `#` character in a filename template to represent a non-zero-padded
incremental number.
  • Loading branch information
dagewa authored Apr 18, 2024
1 parent dd77a69 commit 24a50c7
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 18 deletions.
10 changes: 10 additions & 0 deletions newsfragments/705.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
The template handling mechanism is extended so that a template with a
single ``#`` is expanded to match non-zero-padded sequential numbers.
For example, ``image_#.cbf`` will match ``image_1.cbf``, ``image_2.cbf``,
..., ``image_10.cbf`` and so on.

Using a single ``#`` to match up to 10 images *within* a zero-padded
sequence continues to work as before. For example,
``dials.import template=insulin_1_01#.img`` will match the files
``insulin_1_010.img``, ``insulin_1_011.img``, ..., ``insulin_1_019.img``,
and no others.
22 changes: 16 additions & 6 deletions src/dxtbx/imageset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import natsort

import boost_adaptbx.boost.python

import dxtbx.format.image # noqa: F401, import dependency for unpickling
Expand Down Expand Up @@ -45,7 +47,9 @@
)


def _expand_template(template: str, indices: Iterable[int]) -> list[str]:
def _expand_template_to_sorted_filenames(
template: str, indices: Iterable[int]
) -> list[str]:
"""Expand a template string to a list of filenames.
Args:
Expand All @@ -55,7 +59,13 @@ def _expand_template(template: str, indices: Iterable[int]) -> list[str]:
pfx = template.split("#")[0]
sfx = template.split("#")[-1]
count = template.count("#")
return [f"{pfx}{index:0{count}}{sfx}" for index in indices]
if count == 1:
# Special handling for a template with a single "#", which does not
# assume a zero-padded index.
filenames = [f"{pfx}{index}{sfx}" for index in indices]
else:
filenames = [f"{pfx}{index:0{count}}{sfx}" for index in indices]
return natsort.natsorted(filenames)


class MemReader:
Expand Down Expand Up @@ -449,7 +459,7 @@ def from_template(

# Set the image range
indices = range(image_range[0], image_range[1] + 1)
filenames = _expand_template(template, indices)
filenames = _expand_template_to_sorted_filenames(template, indices)
else:
if "master" not in template:
raise ValueError("Invalid template")
Expand Down Expand Up @@ -486,7 +496,7 @@ def _create_imageset(filelist, check_headers):

# Get the template format
if "#" in template:
filenames = sorted(_expand_template(template, indices))
filenames = _expand_template_to_sorted_filenames(template, indices)
else:
filenames = [template]

Expand All @@ -503,7 +513,7 @@ def _create_sequence(filelist, check_headers):

# Expand the template if necessary
if "#" in template:
filenames = sorted(_expand_template(template, indices))
filenames = _expand_template_to_sorted_filenames(template, indices)
else:
filenames = [template]

Expand Down Expand Up @@ -564,7 +574,7 @@ def make_sequence(

# Get the template format
if "#" in template:
filenames = sorted(_expand_template(template, indices))
filenames = _expand_template_to_sorted_filenames(template, indices)
else:
filenames = [template]

Expand Down
10 changes: 7 additions & 3 deletions src/dxtbx/model/experiment_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,8 +915,6 @@ def from_templates(templates, **kwargs):
f"Image file {filenames[0]} appears to be a '{type(format_class).__name__}', but this is an abstract Format"
)
else:
index = slice(*template_string_number_index(template))

image_range = kwargs.get("image_range")
if image_range:
first, last = image_range
Expand All @@ -926,7 +924,13 @@ def from_templates(templates, **kwargs):
if not kwargs.get("allow_incomplete_sequences", False):
if "#" in template:
# Check all images in range are present - if allowed
all_numbers = {int(f[index]) for f in filenames}
i0, i1 = template_string_number_index(template)
prefix = template[:i0]
suffix = template[i1:]
all_numbers = {
int(f.replace(prefix, "").replace(suffix, ""))
for f in filenames
}
missing = set(range(first, last + 1)) - all_numbers
if missing:
raise ValueError(
Expand Down
7 changes: 6 additions & 1 deletion src/dxtbx/model/scan_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
r"([0-9]{2,12})\.(.*)",
r"(.*)\.([0-9]{2,12})_(.*)",
r"(.*)\.([0-9]{2,12})(.*)",
r"(.*)\.([0-9]{1,12})([^0]*)_(.*)",
r"(.*)\.([0-9]{1})(.*)",
]

joiners = [".", "_", "", ""]
joiners = [".", "_", "", "_", ""]

compiled_patterns = [re.compile(pattern) for pattern in patterns]

Expand All @@ -38,6 +39,10 @@ def template_regex(filename):
exten = "." + groups[0][::-1]
digits = groups[1][::-1]
prefix = groups[2][::-1] + joiners[j]
elif len(groups) == 4:
exten = "." + groups[0][::-1]
digits = groups[1][::-1]
prefix = groups[3][::-1] + joiners[j] + groups[2][::-1]
else:
exten = ""
digits = groups[0][::-1]
Expand Down
26 changes: 19 additions & 7 deletions src/dxtbx/sequence_filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections import defaultdict
from glob import glob

import natsort


def template_regex(filename):
"""Works out a template from a filename.
Expand Down Expand Up @@ -181,6 +183,9 @@ def replace_template_format_with_hash(match):

def template_string_to_glob_expr(template):
    """Translate a ``#`` filename template into a glob expression.

    A template holding exactly one ``#`` has that character replaced by
    ``*``, because the number it stands for is not zero padded and so may
    have any width (https://github.com/cctbx/dxtbx/issues/646). For any
    other template every ``#`` becomes a single-digit class ``[0-9]``.
    """
    is_unpadded = template.count("#") == 1
    placeholder = "*" if is_unpadded else "[0-9]"
    return template.replace("#", placeholder)


Expand All @@ -191,27 +196,34 @@ def template_string_number_index(template):

def locate_files_matching_template_string(template):
    """Return all files matching template.

    The template is first converted to a glob expression and globbed. When
    the template does not contain exactly one ``#`` the glob result is
    returned unchanged.

    A template with exactly one ``#`` is globbed with ``*`` in place of the
    ``#``, which also picks up names where that position is not a number.
    Those are filtered out here: a path is kept only if its file name is the
    template's file-name prefix, then one or more decimal digits, then the
    template's suffix.

    Args:
        template: The filename template, using ``#`` for the image number.

    Returns:
        A list of matching paths, in the order returned by ``glob``.
    """
    matches = glob(template_string_to_glob_expr(template))
    if template.count("#") != 1:
        return matches

    # Exactly one "#", so partition splits cleanly into the text before and
    # after the image number.
    before, _, after = template.partition("#")

    # Only the file name is checked; the directory part was already matched
    # by glob. normcase keeps this check as case-(in)sensitive as glob is on
    # the current platform.
    name_prefix = os.path.normcase(os.path.basename(before))
    name_suffix = os.path.normcase(after)

    # Escape the literal parts so characters such as "." or "+" in the file
    # name are not interpreted as regex syntax.
    patt = re.compile(re.escape(name_prefix) + "[0-9]+" + re.escape(name_suffix))

    return [
        path
        for path in matches
        if patt.fullmatch(os.path.normcase(os.path.basename(path)))
    ]


def template_image_range(template):
"""Return the image range of files with this template."""

# Find the files matching the template
filenames = locate_files_matching_template_string(template)
filenames = sorted(filenames)
filenames = natsort.natsorted(filenames)

# Check that the template matches some files
if len(filenames) == 0:
raise ValueError(f"Template {template} doesn't match any files.")

# Get the template format
index = slice(*template_string_number_index(template))

# Get the first and last indices
if "#" in template:
first = int(filenames[0][index])
last = int(filenames[-1][index])
i0, i1 = template_string_number_index(template)
prefix = template[:i0]
suffix = template[i1:]
first = int(filenames[0].replace(prefix, "").replace(suffix, ""))
last = int(filenames[-1].replace(prefix, "").replace(suffix, ""))
else: # template is one file
first, last = 0, 0

Expand Down
23 changes: 22 additions & 1 deletion tests/test_sequence_filenames.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import shutil

import pytest

from dxtbx.sequence_filenames import template_regex
from dxtbx.sequence_filenames import template_image_range, template_regex


@pytest.mark.parametrize(
Expand All @@ -15,7 +17,26 @@
("foo_bar_002.img1000", "foo_bar_###.img1000", 2),
("foo_bar_00005.img", "foo_bar_#####.img", 5),
("image0010", "image####", 10),
("foo_123_1_1.rodhypix", "foo_123_1_#.rodhypix", 1), # Rigaku-style
],
)
def test_template_regex(filename, template, digits):
assert template_regex(filename) == (template, digits)


def test_template_image_range(dials_data):
    """A zero-padded ``###`` template over the insulin data gives (1, 45)."""
    data_dir = dials_data("insulin", pathlib=True)
    template = str(data_dir / "insulin_1_###.img")
    image_range = template_image_range(template)
    assert image_range == (1, 45)


def test_template_image_range_non_zero_padded(dials_data, tmp_path):
    """A single-``#`` template finds images numbered without zero padding.

    The insulin images matching ``insulin_1_0[0-1]*`` are exposed in
    ``tmp_path`` under the names ``insulin_1_1.img``, ``insulin_1_2.img``, ...
    and the template ``insulin_1_#.img`` is expected to span (1, 19).
    """
    source_dir = dials_data("insulin", pathlib=True)
    originals = sorted(source_dir.glob("insulin_1_0[0-1]*"))

    for number, original in enumerate(originals, start=1):
        target = tmp_path / f"insulin_1_{number}.img"
        # symlink if possible, copy if necessary
        try:
            target.symlink_to(original)
        except OSError:
            shutil.copy(original, target)

    template = str(tmp_path / "insulin_1_#.img")
    assert template_image_range(template) == (1, 19)

0 comments on commit 24a50c7

Please sign in to comment.