Class PDFEXTparser
- java.lang.Object
-
- edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser
-
public class PDFEXTparser extends Object
import org.grobid.core.*; import org.grobid.core.data.*; import org.grobid.core.factory.*; import org.grobid.core.mock.*; import org.grobid.core.utilities.*; import org.grobid.core.engines.Engine;
-
-
Nested Class Summary
Nested Classes Modifier and Type Class Description static class
PDFEXTparser.State
-
Field Summary
-
Constructor Summary
Constructors Constructor Description PDFEXTparser()
// GROBID engine object
protected static Engine engine;
-
Method Summary
All Methods Static Methods Concrete Methods Modifier and Type Method Description static boolean
checkPartOfPreviousReferenceInFirstPosColumn(org.jsoup.nodes.Element divtag, org.jsoup.select.Elements divtags2, int current_index)
static boolean
checkReferenceByFirstLineIndentation(org.jsoup.nodes.Element divtag, org.jsoup.select.Elements divtags, int current_index)
static boolean
childrenNodeHasTextFontAttribute(org.jsoup.nodes.Element divtag, String strPattern_TEXT)
static void
clearData()
static String
closeStateAnnexes(edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser.StateAnnexes currentStateAnnexes)
static String
closeStateSection(edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser.StateSection currentStateSection, Boolean flagParagraphTagOpen)
static int
consume_FOOTNOTE(org.jsoup.nodes.Element divtagIndexNumber, org.jsoup.nodes.Element divtagTextFirstLine, org.jsoup.select.Elements divtagsPages, double fontSizeTextValue, int current_index)
static void
countsCSSAttributeLabels(org.jsoup.nodes.Document doc)
static int
detectAverageTextYAxisDistance(org.jsoup.nodes.Document doc, String strPattern_TEXT)
static double[]
detectColumnsXAxisValue()
static double
detectFontSizeSectionTitlesWithoutNumbering(org.jsoup.nodes.Document doc, double fontSizeValueText, double fontSizeValueTitle)
static boolean
detectFootNoteStart(org.jsoup.nodes.Element divtagIndexNumber, org.jsoup.nodes.Element divtagTextFirstLine, double fontSizeTextValue)
static String
detectTITLEFontSizeAttribute(org.jsoup.nodes.Document doc)
static void
filterOutFootnotes(org.jsoup.nodes.Document doc, int numColumns, String fontFamilyText, String fontSizeText)
static void
filterOutRunningHeadsAndPageNumbers(org.jsoup.nodes.Document doc)
static void
filterOutRunningHeadsAndPageNumbers2(org.jsoup.nodes.Document doc, String fontFamilyText, String fontSizeText)
static void
filterOutTableContents(org.jsoup.nodes.Document doc, int numColumns, String fontFamilyText, String fontSizeText, String strMostUsedHeightAttribute)
static void
generateDivUniqueID(org.jsoup.nodes.Document doc)
static String
getMostUsedFontFamilyAttributeInDivTags()
static String
getMostUsedFontSizeAttributeInDivTags()
static String
getMostUsedHeightAttributeInDivTags()
static void
markDivTagHTMLOutput(org.jsoup.nodes.Element divtag, String articlePartType)
static boolean
nextLineIsNormalText(org.jsoup.select.Elements divtags, int current_index, String strPattern_TEXT)
static PDFEXTresult
parse(PDFEXTresult contentsToParse)
public static String extractHEADERS_GROBID(String PDFfilePath) { BiblioItem resHeader=null; try { // Biblio object for the result resHeader = new BiblioItem(); String tei = engine.processHeader(PDFfilePath, false, resHeader); System.out.println(">>>>>>>> GROBID_RESULTS="+resHeader.toString()); } catch (Exception e) { // If an exception is generated, print a stack trace e.printStackTrace(); } return resHeader.toString(); } /*************************** /** Parsing document method
static void
storesCSSAttributeValues(org.jsoup.nodes.Document doc)
-
-
-
Field Detail
-
hyphenWordsDictionaryEN
protected static HyphenWordsDictionary hyphenWordsDictionaryEN
-
jastTagsDictionary
protected static AbstractDictionary jastTagsDictionary
-
htmlArticlePartTypeColorsDictionary
protected static AbstractDictionary htmlArticlePartTypeColorsDictionary
-
regexpMatcher
protected static RegexpMatcher regexpMatcher
-
yAxisAverageTextDiff
protected static int yAxisAverageTextDiff
-
firstColumnXAxisValue
protected static double firstColumnXAxisValue
-
secondColumnXAxisValue
protected static double secondColumnXAxisValue
-
numColumns
protected static int numColumns
-
bottomValueThresholdForTITLE
protected static double bottomValueThresholdForTITLE
-
minThresholdReferenceDetectByFirstLineIndent
protected static double minThresholdReferenceDetectByFirstLineIndent
-
maxThresholdReferenceDetectByFirstLineIndent
protected static double maxThresholdReferenceDetectByFirstLineIndent
-
maxThresholdReferenceWithinLineBottomDistance
protected static double maxThresholdReferenceWithinLineBottomDistance
-
maxValuePagePxPercentThresholdFilterRunningHeads
protected static double maxValuePagePxPercentThresholdFilterRunningHeads
-
minValuePagePxPercentThresholdFilterRunningHeads
protected static double minValuePagePxPercentThresholdFilterRunningHeads
-
secondLineReferenceXAxisLeftValue
protected static double secondLineReferenceXAxisLeftValue
-
maxThresholdXAxisToIndentReference
protected static double maxThresholdXAxisToIndentReference
-
maxThresholdYAxisFootnoteIndex
protected static double maxThresholdYAxisFootnoteIndex
-
OFFSET_ColumnXAxisValue
protected static double OFFSET_ColumnXAxisValue
-
OFFSET_yAxisAverageTextDiff
protected static double OFFSET_yAxisAverageTextDiff
-
maxOFFSETxAxisIndexFootnote
protected static double maxOFFSETxAxisIndexFootnote
-
biggestYAxis_OFFSET
protected static double biggestYAxis_OFFSET
-
smallestYAxis_OFFSET
protected static double smallestYAxis_OFFSET
-
-
Method Detail
-
parse
public static PDFEXTresult parse(PDFEXTresult contentsToParse)
public static String extractHEADERS_GROBID(String PDFfilePath) { BiblioItem resHeader=null; try { // Biblio object for the result resHeader = new BiblioItem(); String tei = engine.processHeader(PDFfilePath, false, resHeader); System.out.println(">>>>>>>> GROBID_RESULTS="+resHeader.toString()); } catch (Exception e) { // If an exception is generated, print a stack trace e.printStackTrace(); } return resHeader.toString(); } /*************************** /** Parsing document method
- Parameters:
doc -
- Returns:
-
checkPartOfPreviousReferenceInFirstPosColumn
public static boolean checkPartOfPreviousReferenceInFirstPosColumn(org.jsoup.nodes.Element divtag, org.jsoup.select.Elements divtags2, int current_index)
-
checkReferenceByFirstLineIndentation
public static boolean checkReferenceByFirstLineIndentation(org.jsoup.nodes.Element divtag, org.jsoup.select.Elements divtags, int current_index)
-
markDivTagHTMLOutput
public static void markDivTagHTMLOutput(org.jsoup.nodes.Element divtag, String articlePartType)
-
closeStateSection
public static String closeStateSection(edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser.StateSection currentStateSection, Boolean flagParagraphTagOpen)
-
closeStateAnnexes
public static String closeStateAnnexes(edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser.StateAnnexes currentStateAnnexes)
-
childrenNodeHasTextFontAttribute
public static boolean childrenNodeHasTextFontAttribute(org.jsoup.nodes.Element divtag, String strPattern_TEXT)
-
nextLineIsNormalText
public static boolean nextLineIsNormalText(org.jsoup.select.Elements divtags, int current_index, String strPattern_TEXT)
-
clearData
public static void clearData()
-
detectTITLEFontSizeAttribute
public static String detectTITLEFontSizeAttribute(org.jsoup.nodes.Document doc)
-
countsCSSAttributeLabels
public static void countsCSSAttributeLabels(org.jsoup.nodes.Document doc)
-
storesCSSAttributeValues
public static void storesCSSAttributeValues(org.jsoup.nodes.Document doc)
-
detectColumnsXAxisValue
public static double[] detectColumnsXAxisValue()
-
detectAverageTextYAxisDistance
public static int detectAverageTextYAxisDistance(org.jsoup.nodes.Document doc, String strPattern_TEXT)
-
getMostUsedFontSizeAttributeInDivTags
public static String getMostUsedFontSizeAttributeInDivTags()
-
getMostUsedFontFamilyAttributeInDivTags
public static String getMostUsedFontFamilyAttributeInDivTags()
-
getMostUsedHeightAttributeInDivTags
public static String getMostUsedHeightAttributeInDivTags()
-
detectFontSizeSectionTitlesWithoutNumbering
public static double detectFontSizeSectionTitlesWithoutNumbering(org.jsoup.nodes.Document doc, double fontSizeValueText, double fontSizeValueTitle)
-
generateDivUniqueID
public static void generateDivUniqueID(org.jsoup.nodes.Document doc)
-
filterOutRunningHeadsAndPageNumbers2
public static void filterOutRunningHeadsAndPageNumbers2(org.jsoup.nodes.Document doc, String fontFamilyText, String fontSizeText)
-
filterOutRunningHeadsAndPageNumbers
public static void filterOutRunningHeadsAndPageNumbers(org.jsoup.nodes.Document doc)
-
filterOutFootnotes
public static void filterOutFootnotes(org.jsoup.nodes.Document doc, int numColumns, String fontFamilyText, String fontSizeText)
-
detectFootNoteStart
public static boolean detectFootNoteStart(org.jsoup.nodes.Element divtagIndexNumber, org.jsoup.nodes.Element divtagTextFirstLine, double fontSizeTextValue)
-
consume_FOOTNOTE
public static int consume_FOOTNOTE(org.jsoup.nodes.Element divtagIndexNumber, org.jsoup.nodes.Element divtagTextFirstLine, org.jsoup.select.Elements divtagsPages, double fontSizeTextValue, int current_index)
-
-