Skip to content

Commit

Permalink
Update to latest DeLFT version
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Mar 1, 2019
1 parent 533a178 commit acc71d0
Show file tree
Hide file tree
Showing 28 changed files with 63 additions and 45 deletions.
4 changes: 2 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public InitModel(String modelName, File modelPath) {
public void run() {
Jep jep = JEPThreadPool.getInstance().getJEPInstance();
try {
jep.eval(this.modelName+" = sequenceLabelling.Sequence('" + this.modelName.replace("_", "-") + "')");
jep.eval(this.modelName+" = Sequence('" + this.modelName.replace("_", "-") + "')");
jep.eval(this.modelName+".load(dir_path='"+modelPath.getAbsolutePath()+"')");
} catch(JepException e) {
LOGGER.error("DeLFT model initialization failed", e);
Expand Down Expand Up @@ -182,7 +182,7 @@ public void run() {
}

// init model to be trained
jep.eval("model = sequenceLabelling.Sequence('"+this.modelName+
jep.eval("model = Sequence('"+this.modelName+
"', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo="+useELMo+")");

// actual training
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,11 @@ public Jep getJEPInstance() {
jep.eval("import numpy as np");
jep.eval("import keras.backend as K");
jep.eval("os.chdir('" + delftPath.getAbsolutePath() + "')");
jep.eval("from utilities.Embeddings import Embeddings");
jep.eval("import sequenceLabelling");
jep.eval("from sequenceLabelling.reader import load_data_and_labels_crf_file");
jep.eval("from sequenceLabelling.reader import load_data_crf_string");
jep.eval("from delft.utilities.Embeddings import Embeddings");
jep.eval("import delft.sequenceLabelling");
jep.eval("from delft.sequenceLabelling import Sequence");
jep.eval("from delft.sequenceLabelling.reader import load_data_and_labels_crf_file");
jep.eval("from delft.sequenceLabelling.reader import load_data_crf_string");
jep.eval("from sklearn.model_selection import train_test_split");
} catch(JepException e) {
LOGGER.error("JEP initialization failed", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
Expand Down Expand Up @@ -209,7 +210,9 @@ private static void annotatePage(PDDocument document,
Pair<Integer, Integer> thePlace = dictionary.get(teiId);
if (thePlace != null) {
PDPageFitWidthDestination destination = new PDPageFitWidthDestination();
destination.setPageNumber(thePlace.getA());
PDPage pdpage = document.getPage(thePlace.getA());
destination.setPage(pdpage);
//destination.setPageNumber(thePlace.getA());
destination.setTop(thePlace.getB());
PDActionGoTo action = new PDActionGoTo();
action.setDestination(destination);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public static PDDocument annotateFigureAndTables(
String q = XQueryProcessor.getQueryFromResources("figure-table-coords.xq");
String tei = teiDoc.getTei();
if (singleFile) {
System.out.println(tei);
//System.out.println(tei);
}
XQueryProcessor pr = new XQueryProcessor(tei);
SequenceIterator it = pr.getSequenceIterator(q);
Expand All @@ -296,7 +296,7 @@ public static PDDocument annotateFigureAndTables(

//VISUALIZING "Illustration" elements from pdfalto (formerly "IMAGE" elements from pdf2xml)
if (visualizePdf2xmlImages) {
q = XQueryProcessor.getQueryFromResources("figure-coords-pdf2xml.xq");
q = XQueryProcessor.getQueryFromResources("figure-coords-pdfalto.xq");

pr = new XQueryProcessor(xmlFile);
it = pr.getSequenceIterator(q);
Expand Down
2 changes: 0 additions & 2 deletions grobid-core/src/main/resources/xq/figure-coords-pdf2xml.xq

This file was deleted.

2 changes: 2 additions & 0 deletions grobid-core/src/main/resources/xq/figure-coords-pdfalto.xq
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
(: Emits one comma-separated record per Illustration element found anywhere in
   the document: the @number of its PAGE ancestor, followed by the element's
   own HPOS, VPOS, WIDTH and HEIGHT attributes.
   NOTE(review): the ALTO schema names the page element `Page` and carries the
   page index in PHYSICAL_IMG_NR; confirm that `PAGE/@number` really matches
   the pdfalto output, otherwise the page number will be missing from each
   record. :)
for $illustration in //Illustration
let $page := $illustration/ancestor::PAGE
return string-join(
    ($page/@number,
     $illustration/@HPOS,
     $illustration/@VPOS,
     $illustration/@WIDTH,
     $illustration/@HEIGHT),
    ',')
23 changes: 11 additions & 12 deletions grobid-home/models/affiliation-address/config.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
{
"fold_number": 1,
"case_vocab_size": 8,
"use_ELMo": false,
"char_vocab_size": 152,
"model_name": "affiliation-address",
"dropout": 0.5,
"num_char_lstm_units": 25,
"num_word_lstm_units": 100,
"use_char_feature": true,
"word_embedding_size": 300,
"model_type": "BidLSTM_CRF",
"fold_number": 1,
"embeddings_name": "glove-840B",
"use_crf": true,
"num_char_lstm_units": 25,
"max_char_length": 30,
"recurrent_dropout": 0.5,
"recurrent_dropout": 0.25,
"model_name": "affiliation-address",
"num_word_lstm_units": 100,
"char_embedding_size": 25,
"char_vocab_size": 149,
"batch_size": 20,
"use_char_feature": true,
"model_type": "BidLSTM_CRF",
"case_embedding_size": 5,
"batch_size": 20
"dropout": 0.5,
"use_crf": true
}
Binary file modified grobid-home/models/affiliation-address/model_weights.hdf5
Binary file not shown.
Binary file modified grobid-home/models/affiliation-address/preprocessor.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion grobid-home/models/citation/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
"use_char_feature": true,
"embeddings_name": "glove-840B",
"word_embedding_size": 300
}
}
Binary file modified grobid-home/models/citation/preprocessor.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion grobid-home/models/date/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
"case_embedding_size": 5,
"num_char_lstm_units": 25,
"dropout": 0.5
}
}
Binary file modified grobid-home/models/date/preprocessor.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion grobid-home/models/figure/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
"case_embedding_size": 5,
"use_char_feature": true,
"max_char_length": 30
}
}
Binary file modified grobid-home/models/figure/preprocessor.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion grobid-home/models/header/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
"word_embedding_size": 300,
"embeddings_name": "glove-840B",
"use_char_feature": true
}
}
Binary file modified grobid-home/models/header/preprocessor.pkl
Binary file not shown.
20 changes: 10 additions & 10 deletions grobid-home/models/name-citation/config.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"case_vocab_size": 8,
"max_char_length": 30,
"fold_number": 1,
"use_crf": true,
"batch_size": 20,
"case_embedding_size": 5,
"num_char_lstm_units": 25,
"embeddings_name": "glove-840B",
"model_name": "name-citation",
"model_type": "BidLSTM_CRF",
"recurrent_dropout": 0.5,
"char_vocab_size": 84,
"use_char_feature": true,
"max_char_length": 30,
"word_embedding_size": 300,
"case_embedding_size": 5,
"batch_size": 20,
"recurrent_dropout": 0.5,
"case_vocab_size": 8,
"use_ELMo": false,
"model_name": "name-citation",
"num_word_lstm_units": 100,
"use_crf": true,
"num_char_lstm_units": 25,
"model_type": "BidLSTM_CRF",
"char_vocab_size": 84,
"char_embedding_size": 25,
"dropout": 0.5
}
Binary file modified grobid-home/models/name-citation/model_weights.hdf5
Binary file not shown.
Binary file modified grobid-home/models/name-citation/preprocessor.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion grobid-home/models/name-header/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
"char_vocab_size": 161,
"use_char_feature": true,
"dropout": 0.5
}
}
Binary file modified grobid-home/models/name-header/preprocessor.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion grobid-home/models/reference-segmenter/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
"char_embedding_size": 25,
"dropout": 0.5,
"use_ELMo": false
}
}
Binary file modified grobid-home/models/reference-segmenter/preprocessor.pkl
Binary file not shown.
4 changes: 2 additions & 2 deletions grobid-home/models/table/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"embeddings_name": "glove-840B",
"dropout": 0.5,
"case_embedding_size": 5,
"model_name": "table",
"model_name": "grobid-table",
"model_type": "BidLSTM_CRF",
"num_word_lstm_units": 100,
"char_vocab_size": 109,
Expand All @@ -17,4 +17,4 @@
"recurrent_dropout": 0.5,
"word_embedding_size": 300,
"use_crf": true
}
}
Binary file modified grobid-home/models/table/preprocessor.pkl
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -767,8 +767,12 @@ protected PDDocument annotate(File originFile,
PDDocument outputDocument = null;
// list of TEI elements that should come with coordinates
List<String> elementWithCoords = new ArrayList<>();
elementWithCoords.add("ref");
elementWithCoords.add("biblStruct");
if (type == GrobidRestUtils.Annotation.CITATION) {
elementWithCoords.add("ref");
elementWithCoords.add("biblStruct");
} else if (type == GrobidRestUtils.Annotation.FIGURE) {
elementWithCoords.add("figure");
}

GrobidAnalysisConfig config = new GrobidAnalysisConfig
.GrobidAnalysisConfigBuilder()
Expand All @@ -777,18 +781,25 @@ protected PDDocument annotate(File originFile,
.generateTeiCoordinates(elementWithCoords)
.build();

Document teiDoc = engine.fullTextToTEIDoc(originFile, config);
DocumentSource documentSource =
DocumentSource.fromPdf(originFile, config.getStartPage(), config.getEndPage(), true, true, false);

Document teiDoc = engine.fullTextToTEIDoc(documentSource, config);

documentSource =
DocumentSource.fromPdf(originFile, config.getStartPage(), config.getEndPage(), true, true, false);

PDDocument document = PDDocument.load(originFile);
//If no pages, skip the document
if (document.getNumberOfPages() > 0) {
DocumentSource documentSource = teiDoc.getDocumentSource();
outputDocument = dispatchProcessing(type, document, documentSource, teiDoc);
} else {
throw new RuntimeException("Cannot identify any pages in the input document. " +
"The document cannot be annotated. Please check whether the document is valid or the logs.");
}

documentSource.close(true, true, false);

return outputDocument;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3505,5 +3505,9 @@
<bibl><author>A. Oliva, A. Torralba</author>, "<title level="a">Modeling the shape of the scene: A holistic representation of the spatial envelope</title>", <title level="j">International journal of computer vision</title>, <date>2001</date>.</bibl>
<bibl><author>Boubakri, Narjess, Ghoul, Sadok El, and Saffar, Walid</author>, <date>2012</date>, <title level="a">Cash Holdings of Politically Connected Firms</title>, <note type="report">Working Paper</note>, <orgName>Hong Kong Polytechnic University</orgName>.</bibl>
<bibl><author>Jensen, M.C., Meckling, W.H.</author>, <date>1978</date>. <title level="a">Can the corporation survive?</title> <title level="j">Financial Analysts Journal</title>, <biblScope type="vol">34</biblScope>: <biblScope type="pp">31-37</biblScope>.</bibl>
<bibl><title level="m">Pdflib tet</title>. <ptr type="web">http://www.pdflib.com/products/tet/</ptr>. Accessed: <date>2015-05-12</date>.</bibl>
<bibl><title level="m">Poppler</title>. <ptr type="web">http://poppler.freedesktop.org</ptr>. Accessed: <date>2015-05-12</date>.</bibl>
<bibl><title level="m">Tabula</title>. <ptr type="web">http://tabula.technology</ptr>. Accessed: <date>2015-05-13</date>.</bibl>

</listBibl>
</tei>

0 comments on commit acc71d0

Please sign in to comment.