Skip to content

Commit

Permalink
Merge pull request #1166 from kermitt2/fix-affiliation-dl
Browse files Browse the repository at this point in the history
Fix affiliation missing when using DL affiliation-address model
  • Loading branch information
lfoppiano authored Sep 18, 2024
2 parents dc91aa4 + f97efa8 commit f501033
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 37 deletions.
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
package org.grobid.core.engines;

import org.chasen.crfpp.Tagger;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.GrobidModel;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.Affiliation;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorAffiliationAddress;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -24,8 +25,12 @@
public class AffiliationAddressParser extends AbstractParser {
public Lexicon lexicon = Lexicon.getInstance();

/**
 * Creates a parser backed by the supplied model.
 *
 * @param model the model passed on to the super-class constructor
 */
protected AffiliationAddressParser(GrobidModel model) {
super(model);
}

public AffiliationAddressParser() {
super(GrobidModels.AFFILIATION_ADDRESS);
this(GrobidModels.AFFILIATION_ADDRESS);
}

public List<Affiliation> processing(String input) {
Expand Down Expand Up @@ -78,22 +83,26 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizatio
return affiliationBlocks;
}

/**
* Separate affiliation blocks when they appear to be in separate sets of offsets.
*/
protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutToken>> tokenizations) {
ArrayList<String> affiliationBlocks = new ArrayList<String>();
ArrayList<String> affiliationBlocks = new ArrayList<>();
int end = 0;
for(List<LayoutToken> tokenizationSegment : tokenizations) {
if (tokenizationSegment == null || tokenizationSegment.size() == 0)
if (CollectionUtils.isEmpty(tokenizationSegment))
continue;

// if we have an offset shift, we introduce a segmentation of the affiliation block
LayoutToken startToken = tokenizationSegment.get(0);
int start = startToken.getOffset();
if (start-end > 2)
if (start-end > 2 && end > 0)
affiliationBlocks.add("\n");

for(LayoutToken tok : tokenizationSegment) {
if (tok.getText().length() == 0)
if (StringUtils.isEmpty(tok.getText())) {
continue;
}

if (!tok.getText().equals(" ")) {
if (tok.getText().equals("\n")) {
Expand Down Expand Up @@ -123,11 +132,11 @@ public List<Affiliation> processingLayoutTokens(List<List<LayoutToken>> tokeniza

//System.out.println(affiliationBlocks.toString());

List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> placesPositions = new ArrayList<>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<>();
placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation));
countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
List<List<LayoutToken>> allTokens = new ArrayList<>();
allTokens.add(tokenizationsAffiliation);
String affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,28 @@
package org.grobid.core.engines;

import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.AfterClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.assertThat;
import static org.hamcrest.CoreMatchers.is;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import com.google.common.base.Joiner;

import org.grobid.core.GrobidModels;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.Affiliation;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.features.FeaturesVectorAffiliationAddress;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.main.LibraryLoader;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.junit.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.*;
import static org.junit.Assert.assertThat;

public class AffiliationAddressParserTest {

Expand All @@ -43,13 +38,13 @@ public class AffiliationAddressParserTest {

@Before
public void setUp() throws Exception {
this.target = new AffiliationAddressParser();
this.target = new AffiliationAddressParser(GrobidModels.DUMMY);
this.analyzer = GrobidAnalyzer.getInstance();
}

@BeforeClass
public static void init() {
LibraryLoader.load();
// LibraryLoader.load();
GrobidProperties.getInstance();
}

Expand Down Expand Up @@ -257,4 +252,109 @@ public void shouldExtractMultipleAffiliations() throws Exception {
is("University of Madness")
);
}

/**
 * Feeds a hard-coded labelled sequence (one token per row, tab-separated columns) to
 * {@code resultExtractionLayoutTokens} and expects a non-empty result.
 * The test is disabled via {@code @Ignore}; according to the annotation message it only
 * documents an input that currently fails. Going by the method name, the sequence was
 * presumably captured from the deep-learning model - TODO confirm.
 */
@Test
@Ignore("This test is used to show the failing input data")
public void testResultExtractionLayoutTokensFromDLOutput() throws Exception {
// Note: this sequence starts with two empty rows, ends with a trailing newline, and its
// <other> labels carry no "I-" prefix.
String result = "\n" +
"\n" +
"Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<department>\n" +
"of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t<affiliation>\t<department>\n" +
"Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
"Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t<affiliation>\tI-<institution>\n" +
"Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<institution>\n" +
"Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<addrLine>\n" +
"Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<addrLine>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t<affiliation>\tI-<addrLine>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<addrLine>\n" +
"00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t<affiliation>\t<addrLine>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<settlement>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t<affiliation>\tI-<country>\n" +
";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t<affiliation>\t<country>\n";

// One LayoutToken per row, built from the row's first tab-separated column.
// String.split keeps leading empty strings, so the two empty rows at the start of the
// sequence become tokens with empty text.
List<LayoutToken> tokenizations = Arrays.stream(result.split("\n"))
.map(row -> new LayoutToken(row.split("\t")[0]))
.collect(Collectors.toList());

// Only checks that something was extracted, not what was extracted.
assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0)));
}


/**
 * Feeds a hard-coded labelled sequence (one token per row, tab-separated columns) to
 * {@code resultExtractionLayoutTokens} and expects a non-empty result.
 * Going by the method name, the sequence was presumably captured from the CRF model -
 * TODO confirm.
 */
@Test
public void testResultExtractionLayoutTokensFromCRFOutput() throws Exception {
// Note: in this sequence every <other> token is labelled "I-<other>" and there are no
// empty rows at the start or a trailing newline at the end.
String result = "MD\tmd\tM\tMD\tMD\tMD\tD\tMD\tMD\tMD\tLINESTART\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXX\t<affiliation>\tI-<institution>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<department>\n" +
"of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t<affiliation>\t<department>\n" +
"Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
"Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t<affiliation>\tI-<institution>\n" +
"Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<institution>\n" +
"Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<addrLine>\n" +
"Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<addrLine>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t<affiliation>\tI-<postCode>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<postCode>\n" +
"00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t<affiliation>\t<postCode>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<settlement>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t<affiliation>\tI-<country>\n" +
";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t<affiliation>\t<country>";

// One LayoutToken per row, built from the row's first tab-separated column.
List<LayoutToken> tokenizations = Arrays.stream(result.split("\n"))
.map(row -> new LayoutToken(row.split("\t")[0]))
.collect(Collectors.toList());

// Only checks that something was extracted, not what was extracted.
assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0)));
}

/**
 * Two segments whose offsets are far apart (shifted by 100 and by 500) are expected to be
 * separated by a single "\n" entry, and no such entry may appear at the very beginning
 * even though the first segment does not start at offset 0.
 */
@Test
public void testGetAffiliationBlocksFromSegments_1() throws Exception {
    // first segment, moved to start at offset 100
    List<LayoutToken> firstSegment = GrobidAnalyzer.getInstance()
        .tokenizeWithLayoutToken("Department of science, University of Science, University of Madness");
    for (LayoutToken token : firstSegment) {
        token.setOffset(token.getOffset() + 100);
    }

    // second segment, moved to start at offset 500
    List<LayoutToken> secondSegment = GrobidAnalyzer.getInstance()
        .tokenizeWithLayoutToken("Department of mental health, University of happyness, Italy");
    for (LayoutToken token : secondSegment) {
        token.setOffset(token.getOffset() + 500);
    }

    List<List<LayoutToken>> segments = Arrays.asList(firstSegment, secondSegment);
    List<String> blocks = AffiliationAddressParser.getAffiliationBlocksFromSegments(segments);

    // 22 entries in total, the separator sitting at index 11
    assertThat(blocks, hasSize(22));
    assertThat(blocks.get(0), is(not(startsWith("\n"))));
    assertThat(blocks.get(11), is("\n"));
}

/**
 * Two segments whose offsets are close to each other (the second one is shifted by 100 plus
 * the number of tokens of the first one) are expected to be kept together: no separator
 * entry is inserted, neither at the beginning nor between the two segments.
 */
@Test
public void testGetAffiliationBlocksFromSegments_2() throws Exception {
    String block1 = "Department of science, University of Science, University of Madness";
    List<LayoutToken> tokBlock1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block1);
    tokBlock1.forEach(t -> t.setOffset(t.getOffset() + 100));

    String block2 = "Department of mental health, University of happyness, Italy";
    List<LayoutToken> tokBlock2 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block2);
    tokBlock2.forEach(t -> t.setOffset(t.getOffset() + 100 + tokBlock1.size()));

    List<String> affiliationBlocksFromSegments = AffiliationAddressParser.getAffiliationBlocksFromSegments(Arrays.asList(tokBlock1, tokBlock2));

    // one entry fewer than in the far-apart case: no separator has been added
    assertThat(affiliationBlocksFromSegments, hasSize(21));
    assertThat(affiliationBlocksFromSegments.get(0), is(not(startsWith("\n"))));
    // the segment separator is "\n" (as asserted by the sibling test for index 11), so that is
    // the value that must be absent here; the "@newline" check alone could never catch it
    assertThat(affiliationBlocksFromSegments.get(11), is(not("\n")));
    assertThat(affiliationBlocksFromSegments.get(11), is(not("@newline")));
}
}

0 comments on commit f501033

Please sign in to comment.