Improvement of the recovery of Pragmatic Segmenter sentence segmentation text with respect to the original text offsets #701

Open · wants to merge 16 commits into base: master
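In brief, the problem this PR addresses (illustrative strings, not taken from the PR itself): the Pragmatic Segmenter can alter whitespace or characters in the sentences it returns, so a naive lookup of a returned sentence in the original text can fail and the character offsets are lost:

String text = "Measured in the lab\n(Unisense, Aarhus, Denmark).";
// the segmenter may hand the sentence back with the newline collapsed to a space:
String sentence = "Measured in the lab (Unisense, Aarhus, Denmark).";
int start = text.indexOf(sentence); // -1: direct lookup fails, no offsets to report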
build.gradle (1 change: 0 additions & 1 deletion)
@@ -241,7 +241,6 @@ project("grobid-core") {
implementation 'black.ninia:jep:4.0.2'
implementation 'org.apache.opennlp:opennlp-tools:1.9.1'
implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"
}

SentenceDetector.java
@@ -14,7 +14,7 @@ public interface SentenceDetector {
* @return a list of offset positions indicating start and end character
* position of the recognized sentence in the text
*/
List<OffsetPosition> detect(String text);


/**
@@ -24,5 +24,5 @@ public interface SentenceDetector {
* @return a list of offset positions indicating start and end character
* position of the recognized sentence in the text
*/
List<OffsetPosition> detect(String text, Language lang);
}
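For orientation, a minimal sketch of how this interface is consumed; the detector instantiation and sample text are illustrative and assume a configured grobid-home, and OffsetPosition exposes public start and end fields as used elsewhere in this diff:

SentenceDetector detector = new PragmaticSentenceDetector();
String text = "First sentence. Second one follows.";
for (OffsetPosition pos : detector.detect(text)) {
    // each position indexes directly into the original text
    System.out.println(text.substring(pos.start, pos.end));
}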
PragmaticSentenceDetector.java
@@ -1,39 +1,43 @@
package org.grobid.core.lang.impl;

import com.google.common.base.Joiner;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.lang.Language;
import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.matching.DiffMatchPatch;
import org.jruby.embed.LocalContextScope;
import org.jruby.embed.LocalVariableBehavior;
import org.jruby.embed.PathType;
import org.jruby.embed.ScriptingContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Implementation of sentence segmentation via the Pragmatic Segmenter.
 */
public class PragmaticSentenceDetector implements SentenceDetector {
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class);

private ScriptingContainer instance = null;

public PragmaticSentenceDetector() {
String segmenterRbFile = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" +
    File.separator + "pragmatic_segmenter" + File.separator + "segmenter.rb";
String segmenterLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation";
/*String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" +
File.separator + "pragmatic_segmenter" + File.separator + "gem" + File.separator + "gems" +
File.separator + "unicode-0.4.4.4-java" + File.separator + "lib";*/
String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" +
    File.separator + "pragmatic_segmenter" + File.separator + "lib";
//System.out.println(vendorLoadPath);

@@ -49,7 +53,7 @@ public PragmaticSentenceDetector() {

@Override
public List<OffsetPosition> detect(String text) {
return detect(text, new Language(Language.EN));
}

@Override
@@ -65,14 +69,152 @@ public List<OffsetPosition> detect(String text, Language lang) {
//System.out.println(text);
//System.out.println(ret.toString());

List<String> retList = (List<String>) ret;

List<OffsetPosition> result = getSentenceOffsets(text, retList);

return result;
}

/**
 * Locate a substring, as returned (and possibly modified) by the segmenter, in the
 * original text using a character-level diff. Returns the recovered span as it appears
 * in the original text, together with its start offset there (-1 when it cannot be found).
 */
public static Pair<String, Integer> findInText(String subString, String text) {
LinkedList<DiffMatchPatch.Diff> diffs = new DiffMatchPatch().diff_main(text, subString);
List<String> list = new ArrayList<>();

// Transform to a char based sequence
diffs.stream().forEach(d -> {
String text_chunk = d.text;
DiffMatchPatch.Operation operation = d.operation;
String op = " ";
if (operation.equals(DiffMatchPatch.Operation.INSERT)) {
op = "+";
} else if (operation.equals(DiffMatchPatch.Operation.DELETE)) {
op = "-";
}

for (int i = 0; i < text_chunk.toCharArray().length; i++) {
String sb = op + " " + text_chunk.toCharArray()[i];
list.add(sb);
}
});

// keep only entries aligned with the characters of the original text: '+' (insert)
// entries correspond to characters present only in the segmenter output
List<String> list_cleaned = list.stream().filter(d -> d.charAt(0) != '+').collect(Collectors.toList());
// System.out.println(list_cleaned);

// skip the characters of the original text that precede the match, then collect the span
boolean inside = false;
List<String> output = new ArrayList<>();
for (int i = 0; i < list_cleaned.size(); i++) {
String item = list_cleaned.get(i);
if (item.charAt(0) == '-' && !inside) {
continue;
} else {
inside = true;
output.add(String.valueOf(text.charAt(i)));
}
}

// trim unmatched characters from the tail of the collected span
for (int i = output.size() - 1; i > -1; i--) {
String item = list_cleaned.get(i);
if (item.charAt(0) == '-' || item.charAt(0) == '+') {
output.remove(i);
} else {
break;
}
}
String adaptedSubString = Joiner.on("").join(output);
int start = text.indexOf(adaptedSubString);

return Pair.of(adaptedSubString, start);
}
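// Illustrative behaviour of findInText (hypothetical strings, not from this PR): with
//   text      = "below detection limit (\0.01 mg l -1 )."
//   subString = "below detection limit (0.01 mg l -1 )."   (segmenter dropped the '\')
// the char-level diff marks the '\' as a deletion in `text`; filtering the '+' entries
// and trimming unmatched edges recovers the span as it actually occurs in `text`:
//   Pair<String, Integer> hit = findInText(subString, text);
//   hit.getLeft()  -> the recovered span, including the '\'
//   hit.getRight() -> its start offset in `text` (-1 if recovery failed)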


protected static List<OffsetPosition> getSentenceOffsets(String text, List<String> retList) {
// build offset positions from the string chunks
List<OffsetPosition> result = new ArrayList<>();

int previousEnd = -1;
int previousStart = -1;

for (int i = 0; i < retList.size(); i++) {
String sentence = retList.get(i);
String sentenceClean = StringUtils.strip(sentence, "\n");

int start = -1;
int end = -1;

if (previousEnd > -1) {
// search within a bounded window starting at the previous sentence end
String subString = StringUtils.substring(text, previousEnd, previousEnd + 2 * sentenceClean.length());
int relativeIndexOf = subString.indexOf(sentenceClean);
start = relativeIndexOf > -1 ? relativeIndexOf + previousEnd : relativeIndexOf;
} else {
start = text.indexOf(sentenceClean);
}


String outputStr = "";
if (start == -1) {
if (previousEnd > -1) {
String subString = StringUtils.substring(text, previousEnd, previousEnd + 2 * sentenceClean.length());
int relativeIndexOf = subString.replace("\n", " ").indexOf(sentenceClean);
start = relativeIndexOf > -1 ? relativeIndexOf + previousEnd : relativeIndexOf;
} else {
start = text.replace("\n", " ").indexOf(sentenceClean);
}

if (start == -1) {

String textAdapted = text;

if (previousEnd > -1) {
textAdapted = StringUtils.substring(text, previousEnd, previousEnd + 2 * sentenceClean.length());
Pair<String, Integer> inText = findInText(sentenceClean, textAdapted);
start = inText.getRight();
outputStr = inText.getLeft();
start += previousEnd;
} else if (previousStart > -1) {
textAdapted = StringUtils.substring(text, previousStart, previousStart + 2 * sentenceClean.length());
Pair<String, Integer> inText = findInText(sentenceClean, textAdapted);
start = inText.getRight();
outputStr = inText.getLeft();
// the search window started at previousStart, so the offset is relative to it
start += previousStart;
} else {
Pair<String, Integer> inText = findInText(sentenceClean, textAdapted);
start = inText.getRight();
outputStr = inText.getLeft();
}
end = start + outputStr.length();
if (start == -1) {
LOGGER.warn("The starting offset is -1. We have tried to recover it, but probably something is still wrong. Please check. ");
LOGGER.warn(outputStr + " / " + textAdapted);
}
} else {
end = start + sentenceClean.length();
}
} else {
end = start + sentenceClean.length();
}
previousStart = start;

if (start > -1) {
previousEnd = end;
}

result.add(new OffsetPosition(start, end));
}

return result;
}
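// Usage sketch for getSentenceOffsets (hypothetical inputs, not from this PR):
//   String original = "First sentence.\nSecond sentence.";
//   List<String> sentences = Arrays.asList("First sentence.", "Second sentence.");
//   List<OffsetPosition> offsets = getSentenceOffsets(original, sentences);
// each returned OffsetPosition indexes into `original`, even when the segmenter
// returned a sentence whose whitespace differs from the original text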

// Use getSentenceOffsets instead.
@Deprecated
protected static List<OffsetPosition> getSentenceOffsetsOld(String text, List<String> retList) {
// build offset positions from the string chunks
List<OffsetPosition> result = new ArrayList<>();
int pos = 0;
int previousEnd = 0;
// indicates when the sentence as provided by the Pragmatic Segmenter does not match the original string
// and we had to "massage" the string to identify/approximate offsets in the original string
boolean recovered = false;
for(int i=0; i<retList.size(); i++) {
String chunk = retList.get(i);
recovered = false;
@@ -81,7 +223,7 @@ public List<OffsetPosition> detect(String text, Language lang) {
LOGGER.warn("Extracted sentence does not match orginal text - " + chunk);

// Unfortunately the Pragmatic Segmenter can modify the string when it gives back the array of sentences.
// It usually concerns removed white space, which then makes it hard to locate the exact offsets.
// We take as a first fallback the previous end of sentence and move it to the next non-space character;
// the next heuristic is to use the next sentence matching to re-synchronize with the original text.

@@ -93,11 +235,11 @@ public List<OffsetPosition> detect(String text, Language lang) {
// "The dissolved oxygen concentration in the sediment was measured in the lab with an OX-500 micro electrode (Unisense, Aarhus, Denmark) and was below detection limit (\0.01 mg l -1 )."
// -> ["The dissolved oxygen concentration in the sediment was measured in the lab with an OX-500 micro electrode (Unisense, Aarhus, Denmark) and was below detection limit (((((((((\\0.01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 )."]
// original full paragraph: Nonylphenol polluted sediment was collected in June 2005 from the Spanish Huerva River in Zaragoza (41°37 0 23 00 N, 0°54 0 28 00 W), which is a tributary of the Ebro River. At the moment of sampling, the river water had a temperature of 25.1°C, a redox potential of 525 mV and a pH of 7.82. The water contained 3.8 mg l -1 dissolved oxygen. The dissolved oxygen concentration in the sediment was measured in the lab with an OX-500 micro electrode (Unisense, Aarhus, Denmark) and was below detection limit (\0.01 mg l -1 ). The redox potential, temperature and pH were not determined in the sediment for practical reasons. Sediment was taken anaerobically with stainless steel cores, and transported on ice to the laboratory. Cores were opened in an anaerobic glove box with ±1% H 2 -gas and ±99% N 2 -gas to maintain anaerobic conditions, and the sediment was put in a glass jar. The glass jar was stored at 4°C in an anaerobic box that was flushed with N 2 -gas. The sediment contained a mixture of tNP isomers (20 mg kg -1 dry weight), but 4-n-NP was not present in the sediment. The chromatogram of the gas chromatography-mass spectrometry (GC-MS) of the mixture of tNP isomers present in the sediment was comparable to the chromatogram of the tNP technical mixture ordered from Merck. The individual branched isomers were not identified. The total organic carbon fraction of the sediment was 3.5% and contained mainly clay particles with a diameter size \ 32 lM.
// it's less frequent than white space removal, but can happen hundreds of times when processing thousands of PDFs
// -> note: it might be related to jruby sharing of the string and encoding/escaping

if (previousEnd != pos) {
// previous sentence was "recovered", which means we are unsure about its end offset
start = text.indexOf(chunk, previousEnd);
if (start != -1) {
// apparently the current sentence matches a bit before the end offset of the previous sentence, which means that
@@ -108,7 +250,7 @@ public List<OffsetPosition> detect(String text, Language lang) {
while(newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') {
newPreviousEnd--;
if (start - newPreviousEnd > 10) {
// this is a break to avoid going too far
newPreviousEnd = start;
// but look back previous character to cover general case
if (newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') {
@@ -128,7 +270,7 @@ public List<OffsetPosition> detect(String text, Language lang) {
while(text.charAt(start) == ' ') {
start++;
if (start - previousEnd > 10) {
// this is a break to avoid going too far
start = previousEnd+1;
}
}
Expand All @@ -139,7 +281,7 @@ public List<OffsetPosition> detect(String text, Language lang) {
int end = start+chunk.length();

// in case the last sentence is modified
if (end > text.length() && i == retList.size()-1)
end = text.length();

result.add(new OffsetPosition(start, end));
SentenceUtilities.java
@@ -130,6 +130,9 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
* @return list of offset positions for the identified sentence, relative to the input text
*/
public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden, List<LayoutToken> textLayoutTokens, Language lang) {

//String text2 = LayoutTokensUtil.toText(textLayoutTokens);

if (text == null)
return null;
try {
@@ -174,15 +177,18 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
finalSentencePositions.get(currentSentenceIndex).end);
boolean moved = false;

// accumulates token text so the absolute offset of the current token is known
StringBuilder accumulator = new StringBuilder();

// iterate on layout tokens in sync with sentences
for(int i=0; i<textLayoutTokens.size(); i++) {
LayoutToken token = textLayoutTokens.get(i);
accumulator.append(token);
if (token.getText() == null || token.getText().length() == 0)
continue;

if (toSkipToken(token.getText()))
continue;

//Checking whether the text contains the entire first chunk/sentence
int newPos = sentenceChunk.indexOf(token.getText(), pos);

if (newPos != -1) {
@@ -206,12 +212,13 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
if (finalSentencePositions.get(currentSentenceIndex).end + nextToken.getText().length() + buffer >= text.length())
break;

if (toSkipTokenNoHyphen(nextToken.getText())) {
buffer += nextToken.getText().length();
continue;
}

if (isValidSuperScriptNumericalReferenceMarker(nextToken)
    && isNextTokenFallingIntoAForbiddenInterval(accumulator.length() + j, forbidden)) {
pushedEnd += buffer + nextToken.getText().length();
buffer = 0;
} else
@@ -266,6 +273,13 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
}
}


private static boolean isNextTokenFallingIntoAForbiddenInterval(int currentOffset, List<OffsetPosition> forbidden) {
    // true when the offset falls inside any forbidden [start, end) interval
    return forbidden
        .stream().anyMatch(o -> currentOffset >= o.start && currentOffset < o.end);
}
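// For example, with forbidden = [ OffsetPosition(10, 13) ] (say, a reference-marker span):
//   isNextTokenFallingIntoAForbiddenInterval(11, forbidden) -> true
//   isNextTokenFallingIntoAForbiddenInterval(13, forbidden) -> false (the end offset is exclusive)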
}


public static List<OffsetPosition> correctSentencePositions(List<OffsetPosition> sentencePositions, List<OffsetPosition> forbiddenPositions) {
List<OffsetPosition> finalSentencePositions = new ArrayList<>();
int forbiddenIndex = 0;
@@ -311,7 +325,7 @@ static boolean toSkipTokenNoHyphen(String tok) {


/**
 * Return true if the token is a valid numerical reference marker ([0-9,())\-\]\[) in superscript.
 */
private static boolean isValidSuperScriptNumericalReferenceMarker(LayoutToken token) {
