package com.pdflib.cookbook.tet.special; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import com.pdflib.TET; import com.pdflib.TETException; /** * Restrict text extraction to a particular "region of interest", i.e. some area * on the page based on knowledge about the document layout. This can easily be * implemented with the "includebox" and "excludebox" options of * TET_open_page(). * You can visualize page coordinates in Acrobat as follows: * - To display cursor coordinates in Acrobat X/XI/DC use View, Show/Hide, * Cursor Coordinates. * - The coordinates are displayed in the unit which is currently selected in * Acrobat. To change the display units to points (as used in TET) in * Acrobat X/XI/DC proceed as follows: * go to Edit, Preferences, Units & Guides, Units and select Points. * Required software: TET 3 * Required data: PDF document * * @version $Id: region_of_interest.java,v 1.8 2017/03/20 16:11:01 tm Exp $ */ class region_of_interest { /** * Global option list. The program expects the "resource" directory parallel * to the "java" directory. */ private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}"; /** * Document specific option list. */ private static final String DOC_OPTLIST = ""; /** * Page-specific option list. Here we define the region(s) of interest * via an "includebox" list of rectangles. In this case we define the * includebox so the footer line of the input document is not included. *
* As an alternative the footer line could be excluded with the * "excludebox" option. To try this with the input document, replace * the "includebox" option below with the following: *
* excludebox={{0 0 430 70}}
*/
private static final String PAGE_OPTLIST =
"granularity=page includebox={{30 70 430 670}}";
/**
* The encoding in which the output is sent to System.out. When running
* the example in a Windows command window, you can set this to, for
* example, "windows-1252" to get Latin-1 output.
*/
private static final String OUTPUT_ENCODING =
System.getProperty("file.encoding");
/**
* For printing to System.out in the encoding specified via OUTPUT_ENCODING.
*/
private static PrintStream out;
/**
* The name of the input file
*/
private String filename;
/**
* Process a page from the input document.
*
* @param tet
* TET object
* @param doc
* TET document handle
* @param pageno
* Page to process
*
* @throws TETException
* An error occurred in the TET API
*/
private static void process_page(TET tet, final int doc, int pageno)
        throws TETException {
    /* PAGE_OPTLIST carries the includebox that limits the extraction. */
    final int pageHandle = tet.open_page(doc, pageno, PAGE_OPTLIST);

    /* The page could not be opened: report it and leave this page out. */
    if (pageHandle == -1) {
        System.err.println("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
        return;
    }

    /* Print text fragments until get_text() signals the end with null. */
    String fragment = tet.get_text(pageHandle);
    while (fragment != null) {
        out.println(fragment);
        fragment = tet.get_text(pageHandle);
    }

    /* A non-zero error number after the loop is reported on stderr. */
    final boolean extractionFailed = tet.get_errnum() != 0;
    if (extractionFailed) {
        System.err.println("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
    }

    tet.close_page(pageHandle);
}
/**
 * Opens the input document, runs process_page() on every page and closes
 * the document again.
 *
 * If the document cannot be opened, the error is printed to System.err
 * and the method returns. If a TETException is thrown, the error is
 * printed to System.err (including the page number once the page loop has
 * started) and the JVM is terminated with exit status 1.
 *
 * The TET object is deleted in all cases in which it was created,
 * including the error path that ends in System.exit(1).
 */
private void execute() {
    TET tet = null;
    int pageno = 0;
    boolean failed = false;
    try {
        tet = new TET();
        tet.set_option(GLOBAL_OPTLIST);
        final int doc = tet.open_document(filename, DOC_OPTLIST);
        if (doc == -1) {
            System.err.println("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }
        /*
         * Loop over pages in the document
         */
        final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
        for (pageno = 1; pageno <= n_pages; ++pageno) {
            process_page(tet, doc, pageno);
        }
        tet.close_document(doc);
    }
    catch (TETException e) {
        /* pageno is still 0 if the failure happened before the page loop. */
        if (pageno == 0) {
            System.err.println("Error " + e.get_errnum() + " in "
                    + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
        }
        else {
            System.err.println("Error " + e.get_errnum() + " in "
                    + e.get_apiname() + "() on page " + pageno + ": "
                    + e.get_errmsg() + "\n");
        }
        /*
         * Do not exit here: System.exit() would stop the JVM before the
         * finally block gets a chance to delete the TET object.
         */
        failed = true;
    }
    finally {
        /*
         * tet is still null if "new TET()" itself failed; calling delete()
         * then would raise a NullPointerException that hides the real
         * error.
         */
        if (tet != null) {
            tet.delete();
        }
    }
    if (failed) {
        System.exit(1);
    }
}
/**
* @param filename
* the name of the input PDF file from which the text will be
* extracted
*/
private region_of_interest(String filename) {
/* Stored for execute(), which passes it to open_document(). */
this.filename = filename;
}
public static void main(String[] args) throws UnsupportedEncodingException {
System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
out = new PrintStream(System.out, true, OUTPUT_ENCODING);
if (args.length != 1) {
out.println("usage: region_of_interest