Skip to content

Commit

Permalink
add more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Sep 17, 2024
1 parent bd93a61 commit f97efa8
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 25 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.grobid.core.engines;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.GrobidModel;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.Affiliation;
Expand Down Expand Up @@ -81,11 +83,14 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizatio
return affiliationBlocks;
}

/**
* Separate affiliation blocks when they appear to be in separate sets of offsets.
*/
protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutToken>> tokenizations) {
ArrayList<String> affiliationBlocks = new ArrayList<String>();
ArrayList<String> affiliationBlocks = new ArrayList<>();
int end = 0;
for(List<LayoutToken> tokenizationSegment : tokenizations) {
if (tokenizationSegment == null || tokenizationSegment.size() == 0)
if (CollectionUtils.isEmpty(tokenizationSegment))
continue;

// if we have an offset shift, we introduce a segmentation of the affiliation block
Expand All @@ -95,8 +100,9 @@ protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutT
affiliationBlocks.add("\n");

for(LayoutToken tok : tokenizationSegment) {
if (tok.getText().length() == 0)
if (StringUtils.isEmpty(tok.getText())) {
continue;
}

if (!tok.getText().equals(" ")) {
if (tok.getText().equals("\n")) {
Expand Down Expand Up @@ -126,11 +132,11 @@ public List<Affiliation> processingLayoutTokens(List<List<LayoutToken>> tokeniza

//System.out.println(affiliationBlocks.toString());

List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> placesPositions = new ArrayList<>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<>();
placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation));
countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
List<List<LayoutToken>> allTokens = new ArrayList<>();
allTokens.add(tokenizationsAffiliation);
String affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
package org.grobid.core.engines;

import com.google.common.base.Joiner;
import org.grobid.core.GrobidModels;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.AfterClass;
import org.junit.Test;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.Affiliation;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.features.FeaturesVectorAffiliationAddress;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.junit.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.hamcrest.Matchers.*;
import static org.junit.Assert.assertThat;
import static org.hamcrest.CoreMatchers.is;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import com.google.common.base.Joiner;

import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.Affiliation;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.features.FeaturesVectorAffiliationAddress;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.main.LibraryLoader;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.*;
import static org.junit.Assert.assertThat;

public class AffiliationAddressParserTest {

Expand Down Expand Up @@ -260,6 +254,7 @@ public void shouldExtractMultipleAffiliations() throws Exception {
}

@Test
@Ignore("This test is used to show the failing input data")
public void testResultExtractionLayoutTokensFromDLOutput() throws Exception {
String result = "\n" +
"\n" +
Expand Down Expand Up @@ -327,4 +322,39 @@ public void testResultExtractionLayoutTokensFromCRFOutput() throws Exception {

assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0)));
}

@Test
public void testGetAffiliationBlocksFromSegments_1() throws Exception {
    // First segment: tokenized text whose token offsets are all shifted by 100.
    List<LayoutToken> firstSegment = GrobidAnalyzer.getInstance()
        .tokenizeWithLayoutToken("Department of science, University of Science, University of Madness");
    for (LayoutToken token : firstSegment) {
        token.setOffset(token.getOffset() + 100);
    }

    // Second segment: offsets shifted by 500, i.e. far away from the first segment.
    List<LayoutToken> secondSegment = GrobidAnalyzer.getInstance()
        .tokenizeWithLayoutToken("Department of mental health, University of happyness, Italy");
    for (LayoutToken token : secondSegment) {
        token.setOffset(token.getOffset() + 500);
    }

    List<List<LayoutToken>> segments = Arrays.asList(firstSegment, secondSegment);
    List<String> blocks = AffiliationAddressParser.getAffiliationBlocksFromSegments(segments);

    // 22 entries are expected in total, with a "\n" separator entry at index 11
    // marking the boundary between the two segments.
    assertThat(blocks, hasSize(22));
    assertThat(blocks.get(0), is(not(startsWith("\n"))));
    assertThat(blocks.get(11), is("\n"));
}

@Test
public void testGetAffiliationBlocksFromSegments_2() throws Exception {
    // Counterpart of the "far apart offsets" case: here the second segment is only
    // shifted by 100 + the number of tokens of the first segment, and the test expects
    // that no segment separator is inserted between the two segments.
    String block1 = "Department of science, University of Science, University of Madness";
    List<LayoutToken> tokBlock1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block1);
    tokBlock1.forEach(t -> t.setOffset(t.getOffset() + 100));

    String block2 = "Department of mental health, University of happyness, Italy";
    List<LayoutToken> tokBlock2 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block2);
    tokBlock2.forEach(t -> t.setOffset(t.getOffset() + 100 + tokBlock1.size()));

    List<String> affiliationBlocksFromSegments = AffiliationAddressParser.getAffiliationBlocksFromSegments(Arrays.asList(tokBlock1, tokBlock2));

    // 21 entries: one fewer than when a "\n" separator entry is added between the segments.
    assertThat(affiliationBlocksFromSegments, hasSize(21));
    assertThat(affiliationBlocksFromSegments.get(0), is(not(startsWith("\n"))));
    // The entry at the position where a separator would otherwise appear must not be the
    // "\n" segment separator; the original "@newline" check is kept as well.
    assertThat(affiliationBlocksFromSegments.get(11), is(not("\n")));
    assertThat(affiliationBlocksFromSegments.get(11), is(not("@newline")));
}
}

0 comments on commit f97efa8

Please sign in to comment.