Skip to content

Commit

Permalink
Merge pull request #317 from lasztoth/langid-language-analyser
Browse files Browse the repository at this point in the history
Changed LanguageAnalyser to langid
  • Loading branch information
GilHoggarth authored Aug 9, 2024
2 parents 13595be + 82c7a72 commit 61e070a
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 26 deletions.
7 changes: 7 additions & 0 deletions warc-indexer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -320,5 +320,12 @@
<artifactId>lucene-core</artifactId>
<version>8.7.0</version>
</dependency>

<dependency>
<groupId>com.carrotsearch</groupId>
<artifactId>langid-java</artifactId>
<version>1.1.0-SNAPSHOT</version>
</dependency>

</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
*/
package uk.bl.wa.analyser.text;

/*
/*-
* #%L
* warc-indexer
* %%
* Copyright (C) 2013 - 2023 The webarchive-discovery project contributors
* Copyright (C) 2013 - 2024 The webarchive-discovery project contributors
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
Expand All @@ -25,56 +25,62 @@
* #L%
*/


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;

import com.carrotsearch.labs.langid.DetectedLanguage;
import com.carrotsearch.labs.langid.LangIdV3;
import com.typesafe.config.Config;

import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.util.Instrument;

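/**
* Text analyser that detects the language of the supplied text and records
* the result in the {@link SolrFields#CONTENT_LANGUAGE} field.
*
* @author anj
* @author Toth
*
*/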
public class LanguageAnalyser extends AbstractTextAnalyser {
public class LanguageAnalyser extends AbstractTextAnalyser
{
private Logger log = LoggerFactory.getLogger(LanguageAnalyser.class);

/** */
private LanguageDetector ld;
// The language detection model
private LangIdV3 langid;

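/**
* Configures this analyser and creates its language detector. The analyser
* is enabled unless "warc.index.extract.content.language.enabled" is
* present in the configuration and set to false.
*
* @param conf the configuration from which the enabled flag is read
*/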
public void configure(Config conf) {
public void configure(Config conf)
{
setEnabled(!conf.hasPath("warc.index.extract.content.language.enabled")
|| conf.getBoolean(
"warc.index.extract.content.language.enabled"));
ld = new OptimaizeLangDetector().loadModels();
log.info(
"Constructed language analyzer with enabled = " + isEnabled());
|| conf.getBoolean("warc.index.extract.content.language.enabled"));

this.langid = new LangIdV3();

log.debug("Constructed language analyzer with enabled = " + isEnabled());
}

/* (non-Javadoc)
* @see uk.bl.wa.analyser.text.TextAnalyser#analyse(java.lang.String, uk.bl.wa.solr.SolrRecord)
*/
@Override
public void analyse(String text, SolrRecord solr) {
public void analyse(String text, SolrRecord solr)
{
final long start = System.nanoTime();
try {
LanguageResult li = ld.detect(text);
if (li != null) {
solr.addField(SolrFields.CONTENT_LANGUAGE, li.getLanguage());

try
{
DetectedLanguage result = langid.classify(text, true);

if (result != null)
{
solr.addField(SolrFields.CONTENT_LANGUAGE, result.getLangCode());
}
} catch (IllegalArgumentException e) {
log.error("Exception when determining language of this item: "
+ e.getMessage(), e);
}
catch (IllegalArgumentException e)
{
log.error("Exception when determining language of this item: " + e.getMessage(), e);
solr.addParseException(e);
}

Instrument.timeRel("TextAnalyzers#total", "LanguageAnalyzer#total", start);
}

Expand Down

0 comments on commit 61e070a

Please sign in to comment.