The Technology Hub: How to extract data from PDF files using Apache Tika? (31 of 285 technotes for 2015)

Sunday, April 19, 2015

How to extract data from PDF files using Apache Tika? (31 of 285 technotes for 2015)

The Apache Tika™ toolkit detects and extracts metadata and text from over a thousand different file types (such as PPT, XLS, and PDF). All of these file types can be parsed through a single interface, making Tika useful for search engine indexing, content analysis, translation, and much more.

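The code below is a simple example of how to use Apache Tika to print the body text and metadata of a PDF file. The Maven `pom.xml` that follows it declares the Tika dependencies needed to build and run the example.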

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
 
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
 
public class Sample {
 
    public static void main(String[] args){
        InputStream is = null;
 
        try {
            is = new BufferedInputStream(new FileInputStream(new File("sample.pdf")));
 
            Parser parser = new AutoDetectParser();
            ContentHandler handler = new BodyContentHandler(System.out);
 
            Metadata metadata = new Metadata();
 
            parser.parse(is, handler, metadata, new ParseContext());
 
            for (String name : metadata.names()) {
                String value = metadata.get(name);
 
                if (value != null) {
                    System.out.println("Metadata Name:  " + name);
                    System.out.println("Metadata Value: " + value);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch(IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

<?xml version="1.0" ?>
<!--
    Maven build for the Tika example.
    Run it with: mvn compile exec:java
-->
<project
    xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>
    <groupId>net.kinjouj.tika</groupId>
    <artifactId>kinjouj_tika</artifactId>
    <version>1.0</version>
    <name>kinjouj_tika</name>

    <build>
        <plugins>
            <!-- Compile at the Java 1.6 language level with UTF-8 sources. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.6</source>
                    <target>1.6</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Lets "mvn exec:java" launch the Sample class directly. -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>1.2.1</version>
                <configuration>
                    <mainClass>Sample</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Core Tika API: Parser, Metadata, ParseContext, BodyContentHandler. -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.1</version>
        </dependency>
        <!-- Format-specific parsers (including PDF) used by AutoDetectParser. -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.1</version>
        </dependency>
    </dependencies>
</project>

Source: https://gist.github.com/kinjouj/2507727

1 comment:

cynthiawilliams said...: Excellent post, it will be definitely helpful for many people. Keep posting more like this. AWS Training in Chennai
AWS course in Chennai
DevOps certification in Chennai
DevOps Training in Chennai
Data Science Course in Chennai
Data Science Training in Chennai
AWS Training in Velachery
AWS Training in Tambaram; January 23, 2019 at 12:03 AM