Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unsynchronize ICU4j transliterate #246

Merged
merged 7 commits into from
Jun 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package com.onthegomap.planetiler.util;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Objects;
import java.util.function.Predicate;

/**
* A class loader that loads certain classes even if they are already loaded by a parent classloader.
*
* This makes it possible to get multiple copies of the same class, so for example you could invoke a method
* synchronized on a static variable from different classes without contention.
*/
public class DuplicateClassLoader extends ClassLoader {

private final Predicate<String> shouldDuplicate;

private DuplicateClassLoader(Predicate<String> shouldDuplicate) {
this.shouldDuplicate = shouldDuplicate;
}

/**
* Returns a {@link ClassLoader} that loads every class with a name starting with {@code prefix} even if the parent
* has already loaded it.
*/
public static DuplicateClassLoader duplicateClassesWithPrefix(String prefix) {
return new DuplicateClassLoader(name -> name.startsWith(prefix));
}

@Override
public Class<?> loadClass(String name) throws ClassNotFoundException {
if (shouldDuplicate.test(name)) {
Class<?> c = findLoadedClass(name);
if (c == null) {
byte[] b = loadClassFromFile(name);
return defineClass(name, b, 0, b.length);
}
}
return super.loadClass(name);
}

private byte[] loadClassFromFile(String fileName) {
String classFileName = fileName.replace('.', '/') + ".class";
try (var inputStream = getClass().getClassLoader().getResourceAsStream(classFileName)) {
return Objects.requireNonNull(inputStream, "Could not load " + fileName + " (" + classFileName + ")")
.readAllBytes();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package com.onthegomap.planetiler.util;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

/**
* A {@link com.ibm.icu.text.Transliterator} that does not share any static data with other thread local
* transliterators.
* <p>
* By default, {@link com.ibm.icu.text.Transliterator} synchronizes on static data during transliteration, which results
* in contention between threads when transliterating many strings in parallel. Separate instances of this class can be
* used across different threads in order to transliterate without contention.
*/
public class ThreadLocalTransliterator {
private final ClassLoader classLoader = DuplicateClassLoader.duplicateClassesWithPrefix("com.ibm.icu");

/**
* Returns a {@link com.ibm.icu.text.Transliterator} for {@code id} that does not share any data with transliterators
* on other threads.
*/
public TransliteratorInstance getInstance(String id) {
try {
Class<?> cls = classLoader.loadClass("com.ibm.icu.text.Transliterator");
Method getInstance = cls.getMethod("getInstance", String.class);
Object t = getInstance.invoke(null, id);
Method transform = cls.getMethod("transliterate", String.class);
return str -> {
try {
return (String) transform.invoke(t, str);
} catch (IllegalAccessException | InvocationTargetException e) {
throw new IllegalStateException(e);
}
};
} catch (ClassNotFoundException | InvocationTargetException | NoSuchMethodException | IllegalAccessException e) {
throw new IllegalStateException(e);
}
}

@FunctionalInterface
public interface TransliteratorInstance {
String transliterate(String input);
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.onthegomap.planetiler.util;

import com.ibm.icu.text.Transliterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
Expand All @@ -16,6 +15,11 @@
* wikidata} tag on a source feature points to.
*/
public class Translations {
// Ignore warnings about not removing thread local values since planetiler uses dedicated worker threads that release
// values when a task is finished and are not re-used.
@SuppressWarnings("java:S5164")
private static final ThreadLocal<ThreadLocalTransliterator.TransliteratorInstance> TRANSLITERATOR =
ThreadLocal.withInitial(() -> new ThreadLocalTransliterator().getInstance("Any-Latin"));

private boolean shouldTransliterate = true;
private final Set<String> languageSet;
Expand Down Expand Up @@ -111,28 +115,23 @@ private static class OsmTranslationProvider implements TranslationProvider {
@Override
public Map<String, String> getNameTranslations(Map<String, Object> tags) {
Map<String, String> result = new HashMap<>();
for (String key : tags.keySet()) {
if (key.startsWith("name:")) {
Object value = tags.get(key);
if (value instanceof String stringVal) {
result.put(key, stringVal);
}
for (var entry : tags.entrySet()) {
String key = entry.getKey();
if (key.startsWith("name:") && entry.getValue()instanceof String stringVal) {
result.put(key, stringVal);
}
}
return result;
}
}

private static final Transliterator TO_LATIN_TRANSLITERATOR = Transliterator.getInstance("Any-Latin");

/**
* Attempts to translate non-latin characters to latin characters that preserve the <em>sound</em> of the word (as
* opposed to translation which attempts to preserve meaning) using ICU4j.
* <p>
* NOTE: This can be expensive and transliteration is synchronized deep down in ICU4j internals which limits benefit
* of running in multiple threads, so exhaust all other options first.
* NOTE: This can be expensive and the quality is hit or miss, so exhaust all other options first.
*/
public static String transliterate(String input) {
return input == null ? null : TO_LATIN_TRANSLITERATOR.transform(input);
return input == null ? null : TRANSLITERATOR.get().transliterate(input);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.onthegomap.planetiler.util;

import static org.junit.jupiter.api.Assertions.assertNotSame;
import static org.junit.jupiter.api.Assertions.assertSame;

import org.junit.jupiter.api.Test;

class DuplicateClassLoaderTest {

@Test
void testDuplicateClassLoader() throws Exception {
var cl1 = DuplicateClassLoader.duplicateClassesWithPrefix("com.onthegomap.planetiler.util");
var cl2 = DuplicateClassLoader.duplicateClassesWithPrefix("com.onthegomap.planetiler.util");
var tc1 = cl1.loadClass("com.onthegomap.planetiler.util.TestClass");
var tc2 = cl2.loadClass("com.onthegomap.planetiler.util.TestClass");
assertSame(tc1, cl1.loadClass("com.onthegomap.planetiler.util.TestClass"));
assertNotSame(tc1, tc2);
tc1.getConstructor().newInstance();
tc2.getConstructor().newInstance();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package com.onthegomap.planetiler.util;

public class TestClass {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.onthegomap.planetiler.util;

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.junit.jupiter.api.Test;

class ThreadLocalTransliteratorTest {
@Test
void test() {
var t1 = new ThreadLocalTransliterator().getInstance("Any-Latin");
var t2 = new ThreadLocalTransliterator().getInstance("Any-Latin");
assertEquals("rì běn", t1.transliterate("日本"));
assertEquals("rì běn", t2.transliterate("日本"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,9 @@ void testTwoProviders() {
assertEquals(Map.of("name:en", "en2", "name:es", "es1", "name:de", "de2"),
translations.getTranslations(Map.of("name:en", "en1", "name:es", "es1")));
}

@Test
void testTransliterate() {
assertEquals("rì běn", Translations.transliterate("日本"));
}
}