Sunday, April 19, 2015

How to extract data from PDF files using Apache Tika? (31 of 285 technotes for 2015)



The Apache Tika™ toolkit detects and extracts metadata and text from over a thousand different file types (such as PPT, XLS, and PDF). All of these file types can be parsed through a single interface, making Tika useful for search engine indexing, content analysis, translation, and much more.


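The code below shows a simple example of how to use Apache Tika to extract the text and metadata of a PDF file, followed by the Maven pom.xml used to build and run it.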

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class Sample {

   public static void main(String[] args){
       InputStream is = null;

       try {
           is = new BufferedInputStream(new FileInputStream(new File("sample.pdf")));

           Parser parser = new AutoDetectParser();
           ContentHandler handler = new BodyContentHandler(System.out);

           Metadata metadata = new Metadata();

           parser.parse(is, handler, metadata, new ParseContext());

           for (String name : metadata.names()) {
               String value = metadata.get(name);

               if (value != null) {
                   System.out.println("Metadata Name:  " + name);
                   System.out.println("Metadata Value: " + value);
               }
           }
       } catch (IOException e) {
           e.printStackTrace();
       } catch (TikaException e) {
           e.printStackTrace();
       } catch (SAXException e) {
           e.printStackTrace();
       } finally {
           if (is != null) {
               try {
                   is.close();
               } catch(IOException e) {
                   e.printStackTrace();
               }
           }
       }
   }
}

<?xml version="1.0" ?>
<!--
  Maven build for the Tika sample above.
  Run it with: mvn compile exec:java
-->
<project
   xmlns="http://maven.apache.org/POM/4.0.0"
   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

   <modelVersion>4.0.0</modelVersion>
   <groupId>net.kinjouj.tika</groupId>
   <artifactId>kinjouj_tika</artifactId>
   <version>1.0</version>
   <name>kinjouj_tika</name>

   <build>
       <plugins>
           <!-- Compile as Java 6 with UTF-8 encoded sources. -->
           <plugin>
               <groupId>org.apache.maven.plugins</groupId>
               <artifactId>maven-compiler-plugin</artifactId>
               <version>2.3.2</version>
               <configuration>
                   <source>1.6</source>
                   <target>1.6</target>
                   <encoding>UTF-8</encoding>
               </configuration>
           </plugin>
           <!-- Lets "mvn exec:java" launch the Sample class (default package). -->
           <plugin>
               <groupId>org.codehaus.mojo</groupId>
               <artifactId>exec-maven-plugin</artifactId>
               <version>1.2.1</version>
               <configuration>
                   <mainClass>Sample</mainClass>
               </configuration>
           </plugin>
       </plugins>
   </build>

   <dependencies>
       <!-- Core Tika API: Parser, Metadata, ParseContext, content handlers. -->
       <dependency>
           <groupId>org.apache.tika</groupId>
           <artifactId>tika-core</artifactId>
           <version>1.1</version>
       </dependency>
       <!-- Format-specific parsers (PDF and others); keep in step with tika-core. -->
       <dependency>
           <groupId>org.apache.tika</groupId>
           <artifactId>tika-parsers</artifactId>
           <version>1.1</version>
       </dependency>
   </dependencies>
</project>

Source: https://gist.github.com/kinjouj/2507727