Class PDFEXTparser


  • public class PDFEXTparser
    extends Object
    import org.grobid.core.*; import org.grobid.core.data.*; import org.grobid.core.factory.*; import org.grobid.core.mock.*; import org.grobid.core.utilities.*; import org.grobid.core.engines.Engine;
    • Field Detail

      • htmlArticlePartTypeColorsDictionary

        protected static AbstractDictionary htmlArticlePartTypeColorsDictionary
      • mapCountsCSSHeights

        protected static Map<String,​Integer> mapCountsCSSHeights
      • mapCountsCSSFontSize

        protected static Map<String,​Integer> mapCountsCSSFontSize
      • mapCountsCSSFontFamily

        protected static Map<String,​Integer> mapCountsCSSFontFamily
      • mapCountsCSSBottom

        protected static Map<String,​Integer> mapCountsCSSBottom
      • mapValuesCSSHeights

        protected static Map<String,​Double> mapValuesCSSHeights
      • mapValuesCSSFontSize

        protected static Map<String,​Double> mapValuesCSSFontSize
      • mapValuesCSSBottom

        protected static Map<String,​Double> mapValuesCSSBottom
      • mapValuesCSSLeft

        protected static Map<String,​Double> mapValuesCSSLeft
      • yAxisAverageTextDiff

        protected static int yAxisAverageTextDiff
      • firstColumnXAxisValue

        protected static double firstColumnXAxisValue
      • secondColumnXAxisValue

        protected static double secondColumnXAxisValue
      • numColumns

        protected static int numColumns
      • bottomValueThresholdForTITLE

        protected static double bottomValueThresholdForTITLE
      • minThresholdReferenceDetectByFirstLineIndent

        protected static double minThresholdReferenceDetectByFirstLineIndent
      • maxThresholdReferenceDetectByFirstLineIndent

        protected static double maxThresholdReferenceDetectByFirstLineIndent
      • maxThresholdReferenceWithinLineBottomDistance

        protected static double maxThresholdReferenceWithinLineBottomDistance
      • maxValuePagePxPercentThresholdFilterRunningHeads

        protected static double maxValuePagePxPercentThresholdFilterRunningHeads
      • minValuePagePxPercentThresholdFilterRunningHeads

        protected static double minValuePagePxPercentThresholdFilterRunningHeads
      • secondLineReferenceXAxisLeftValue

        protected static double secondLineReferenceXAxisLeftValue
      • maxThresholdXAxisToIndentReference

        protected static double maxThresholdXAxisToIndentReference
      • maxThresholdYAxisFootnoteIndex

        protected static double maxThresholdYAxisFootnoteIndex
      • OFFSET_ColumnXAxisValue

        protected static double OFFSET_ColumnXAxisValue
      • OFFSET_yAxisAverageTextDiff

        protected static double OFFSET_yAxisAverageTextDiff
      • maxOFFSETxAxisIndexFootnote

        protected static double maxOFFSETxAxisIndexFootnote
      • biggestYAxis_OFFSET

        protected static double biggestYAxis_OFFSET
      • smallestYAxis_OFFSET

        protected static double smallestYAxis_OFFSET
    • Constructor Detail

      • PDFEXTparser

        public PDFEXTparser()
        // GROBID engine object: protected static Engine engine;
    • Method Detail

      • parse

        public static PDFEXTresult parse​(PDFEXTresult contentsToParse)
        public static String extractHEADERS_GROBID(String PDFfilePath) {
            BiblioItem resHeader=null;
            try {
                // Biblio object for the result
                resHeader = new BiblioItem();
                String tei = engine.processHeader(PDFfilePath, false, resHeader);
                System.out.println(">>>>>>>> GROBID_RESULTS="+resHeader.toString());
            } catch (Exception e) {
                // If an exception is generated, print a stack trace
                e.printStackTrace();
            }
            return resHeader.toString();
        }
        /***************************
        /** Parsing document method
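        Parameters:
        contentsToParse - the PDFEXTresult to parse (the original comment names this parameter "doc", which does not match the declared signature)
        Returns:
        a PDFEXTresult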
      • checkPartOfPreviousReferenceInFirstPosColumn

        public static boolean checkPartOfPreviousReferenceInFirstPosColumn​(org.jsoup.nodes.Element divtag,
                                                                           org.jsoup.select.Elements divtags2,
                                                                           int current_index)
      • checkReferenceByFirstLineIndentation

        public static boolean checkReferenceByFirstLineIndentation​(org.jsoup.nodes.Element divtag,
                                                                   org.jsoup.select.Elements divtags,
                                                                   int current_index)
      • markDivTagHTMLOutput

        public static void markDivTagHTMLOutput​(org.jsoup.nodes.Element divtag,
                                                String articlePartType)
      • closeStateSection

        public static String closeStateSection​(edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser.StateSection currentStateSection,
                                               Boolean flagParagraphTagOpen)
      • closeStateAnnexes

        public static String closeStateAnnexes​(edu.upf.taln.dri.common.connector.pdfext.localappo.PDFEXTparser.StateAnnexes currentStateAnnexes)
      • childrenNodeHasTextFontAttribute

        public static boolean childrenNodeHasTextFontAttribute​(org.jsoup.nodes.Element divtag,
                                                               String strPattern_TEXT)
      • nextLineIsNormalText

        public static boolean nextLineIsNormalText​(org.jsoup.select.Elements divtags,
                                                   int current_index,
                                                   String strPattern_TEXT)
      • clearData

        public static void clearData()
      • detectTITLEFontSizeAttribute

        public static String detectTITLEFontSizeAttribute​(org.jsoup.nodes.Document doc)
      • countsCSSAttributeLabels

        public static void countsCSSAttributeLabels​(org.jsoup.nodes.Document doc)
      • storesCSSAttributeValues

        public static void storesCSSAttributeValues​(org.jsoup.nodes.Document doc)
      • detectColumnsXAxisValue

        public static double[] detectColumnsXAxisValue()
      • detectAverageTextYAxisDistance

        public static int detectAverageTextYAxisDistance​(org.jsoup.nodes.Document doc,
                                                         String strPattern_TEXT)
      • getMostUsedFontSizeAttributeInDivTags

        public static String getMostUsedFontSizeAttributeInDivTags()
      • getMostUsedFontFamilyAttributeInDivTags

        public static String getMostUsedFontFamilyAttributeInDivTags()
      • getMostUsedHeightAttributeInDivTags

        public static String getMostUsedHeightAttributeInDivTags()
      • detectFontSizeSectionTitlesWithoutNumbering

        public static double detectFontSizeSectionTitlesWithoutNumbering​(org.jsoup.nodes.Document doc,
                                                                         double fontSizeValueText,
                                                                         double fontSizeValueTitle)
      • generateDivUniqueID

        public static void generateDivUniqueID​(org.jsoup.nodes.Document doc)
      • filterOutRunningHeadsAndPageNumbers2

        public static void filterOutRunningHeadsAndPageNumbers2​(org.jsoup.nodes.Document doc,
                                                                String fontFamilyText,
                                                                String fontSizeText)
      • filterOutRunningHeadsAndPageNumbers

        public static void filterOutRunningHeadsAndPageNumbers​(org.jsoup.nodes.Document doc)
      • filterOutFootnotes

        public static void filterOutFootnotes​(org.jsoup.nodes.Document doc,
                                              int numColumns,
                                              String fontFamilyText,
                                              String fontSizeText)
      • detectFootNoteStart

        public static boolean detectFootNoteStart​(org.jsoup.nodes.Element divtagIndexNumber,
                                                  org.jsoup.nodes.Element divtagTextFirstLine,
                                                  double fontSizeTextValue)
      • consume_FOOTNOTE

        public static int consume_FOOTNOTE​(org.jsoup.nodes.Element divtagIndexNumber,
                                           org.jsoup.nodes.Element divtagTextFirstLine,
                                           org.jsoup.select.Elements divtagsPages,
                                           double fontSizeTextValue,
                                           int current_index)
      • filterOutTableContents

        public static void filterOutTableContents​(org.jsoup.nodes.Document doc,
                                                  int numColumns,
                                                  String fontFamilyText,
                                                  String fontSizeText,
                                                  String strMostUsedHeightAttribute)