#!/usr/local/bin/perl
#
# Copyright (c) 1998-1999, Shigeru Ishida (ishida@isl.intec.co.jp)
#
# Permission to use, copy, modify, distribute, and its documentation,
# in whole or in part, for any purpose, is hereby granted, provided that:
#
# 1. This copyright and permission notice appear in all copies of the
# software and its documentation. Notices of copyright or attribution
# which appear at the beginning of any file must remain unchanged.
#
# 2. The name of Shigeru Ishida may not be used to endorse or 
# promote products derived from this software without specific
# prior written permission.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,
# EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
# WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
# IN NO EVENT SHALL SHIGERU ISHIDA BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
# INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER OR
# NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
# LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
#
# --
# PDF2TXT is a simple tool to extract text from PDF files.
#
# History:
# 0.81	1999.11.21
#   - Add '-i' option for extracting document information.
#     Thanks to Masaaki Kondo <Masaaki_Kondo@haseko.co.jp>.
#
# 0.80	1999.10.13
#   - Check the font size to decide whether or not to insert a space
#     between words.
#
# 0.79	1999.10.7
#   - Modify dealing with text positioning operators.
#
# 0.78	1999.10.6
#   - Check Type3 font encoding.
#
# 0.77	1999.10.5
#   - Modify dealing with text positioning operators.
#
# 0.76	1999.10.4
#   - Fix bug in reading trailer information.
#
# 0.75	1999.9.22
#   - Fix bug in reading cross-reference table.
#   - Fix bug in extracting string data from the stream decoded.
#
# 0.74	1999.4.26
#   - Correspond to Ext-H CMap.
#
# 0.73	1999.4.24
#   - Modify cid2code().
#
# 0.72	1999.4.23
#   - Correspond to CMaps for Katakana and Hankaku.
#     Thanks to Rei FURUKAWA <furukawa@tcp-ip.or.jp>.
#
# 0.71	1999.4.22
#   - Set the start offset (in bytes) of the PDF content. For the MacBinary II
#     format, the offset is 128 bytes. Thanks to Rei FURUKAWA <furukawa@tcp-ip.or.jp>.
#
# 0.70	1999.4.14
#   - Modify cid2code().
#   - Fix bug in dealing with predefined font encodings.
#
# 0.66	1999.4.14
#   - Fix bug in read_encodings().
#
# 0.65	1999.4.13
#   - Add $txtfh FileHandle variable and delete TXTFH FileHandle variable.
#   - Correspond to modify &CMap::cid2code().
#   - Modify add_space_in_line() and add_space_in_word().
#
# 0.64	1999.4.9
#   - Change FileHandle from STDOUT to STDERR to output a message about loading
#     CMap files.
#   - Correspond to modify &CMap::cid2code().
#
# 0.63	1999.4.8
#   - Add two options to indicate a range of pages from which text is extracted.
#       -f <num> : first page
#       -l <num> : last page
#
# 0.62	1999.4.7
#   - Fix bug in dealing with predefined font encodings.
#
# 0.61	1999.4.7
#   - Fix bug in dealing with predefined font encodings.
#
# 0.60	1999.4.7
#   - Correspond to predefined font encodings below:
#       StandardEncoding
#       SymbolEncoding
#       ZapfDingbatsEncoding
#       MacRomanEncoding
#       WinAnsiEncoding
#   - Remove '-o' option for omitting character code \ddd (octal).
#
# 0.57	1999.4.3
#   - Correspond to Type 0 fonts and use CMap.pl to convert CID binary strings
#     into character codes.
#
# 0.56	1999.4.2
#   * Read all names of encoding methods of all subtypes as well as Type0 fonts.
#
# 0.55	1999.4.2
#   * Modify dealing with unknown decode-filters.
#
# 0.54	1999.4.2
#   * Fix bug in extracting string data from the stream decoded.
#
# 0.53	1999.3.31
#   * Add PDFLZW.pl.
#
# 0.52	1999.3.31
#   * Add Base85.pl.
#   * Remove '-f' option for base85-decoding if length of base85 data not
#     a multiple of 5.
#   * Remove '-h' option to delete a hyphen as a word separator just before
#     newline.
#
# 0.51	1999.3.30
#   * Read names of encoding methods of Type0 fonts.
#
# 0.50	1999.3.27
#   * Prepared version for corresponding to Type 0 fonts. Notice that this
#     version may not yet deal with these fonts.
#
# 0.27	1999.1.22
#   - Add the option to delete a hyphen as a word separator just before newline.
#
# 0.26	1999.1.21
#   - Fix bug in extracting stream data.
#   - Fix bug in extracting string data from the stream decoded.
#
# 0.25	1998.12.17
#   - Use a temporary file when uncompress is called to decompress LZW data.
#     And so IPC::Open2 is not used any longer.
#
# 0.24	1998.11.28
#   - Refer the operators of drawing text data. And so remove '-n' option not
#     to put newline '\n' with the text extracted from PDF files.
#
# 0.23	1998.11.6
#   - Fix bug in extracting stream data.
#
# 0.22	1998.10.31
#   - Fix bug in converting "\(" into "(", "\)" into ")" and "\\" into "\".
#
# 0.21	1998.10.29
#   - Fix bug about $lzw_stream variable.
#
# 0.20	1998.10.29
#   - Convert PDF-format LZW data to compress-format LZW data and then decode
#     the data with the UNIX uncompress command. The standard UNIX compress utility
#     uses LZW, but with a slightly different file format. This conversion does *not*
#     decompress the data. It simply converts it to a different file format.
#     pdf2txt then calls uncompress to actually decompress the data.
#     For Unisys's LZW patent, see <http://www.unisys.com/LeadStory/lzwfaq.html>.
#
# 0.12	1998.10.27
#   - Add the option for omitting character code \ddd (octal).
#   - Convert "\(" into "(", "\)" into ")" and "\\" into "\".
#   - Fix bug in extracting string data from the stream decoded.
#
# 0.11	1998.10.15
#   - Add the option for the output on stdout.
#
# 0.10	1998.10.12
#   - Use cross-reference table.
#
# 0.04	1998.10.10
#   - Fix bug in dealing with unknown decode-filters.
#   - Add the option for base85-decoding if length of base85 data not a multiple of 5.
#
# 0.03	1998.10.9
#   - Fix bug in extracting stream data.
#
# 0.02	1998.10.6
#   - Fix bug in processing postscript after decode-filtering.
#
# 0.01	1998.10.5
#   - Initial release.
#
# This tool uses the software listed below.
#
# . zlib		http://www.cdrom.com/pub/infozip/zlib/
# . Compress::Zlib	CPAN
# . uncompress		UNIX uncompress command
# . Jcode		http://openlab.ring.gr.jp/Jcode/
# . Base85.pl		ftp://paprika.noc.intec.co.jp/pub/person/ishida/freeware/pdf2txt/
# . PDFLZW.pl		ftp://paprika.noc.intec.co.jp/pub/person/ishida/freeware/pdf2txt/
# . PDFEncoding.pl	ftp://paprika.noc.intec.co.jp/pub/person/ishida/freeware/pdf2txt/
# . makeCMap.pl		ftp://paprika.noc.intec.co.jp/pub/person/ishida/freeware/pdf2txt/
#
# Supported decoding methods:
#
#   /ASCIIHexDecode
#   /ASCII85Decode	ASCII85Decode filter,
#			Portable Document Format Reference Manual Version 1.2, p.51,
#			27-Nov-1996. Adobe Systems Incorporated.
#   /FlateDecode	RFC1951
#   /LZWDecode		See History: 0.20 above.
#
use Getopt::Std;

# Parse the command line: -f and -l take a page number, -s and -i are flags.
getopts('f:l:si');

# Page numbers are 1-based on the command line and 0-based internally.
# A missing (or non-positive) -f means "from the first page"; a missing
# (or non-positive) -l means "up to page index 100000".
$opt_f = ($opt_f >= 1) ? $opt_f - 1 : 0;
$opt_l = ($opt_l >= 1) ? $opt_l - 1 : 100000;

# Without any file argument, print the usage text and exit with status 1.
unless (@ARGV) {
    print <<USAGE;
PDF2TXT is a simple tool to extract text from PDF files (not encrypted).
Usage: pdf2txt [options] file...
    options are:
        -f <num>       : first page to extract
        -l <num>       : last page to extract
        -s             : output on stdout
        -i             : document information
(version 0.81 copyright (c) 1998-1999, Shigeru Ishida)
USAGE
    exit(1);
}

use Compress::Zlib;
use Jcode;
require "Base85.pl";
require "PDFLZW.pl";
require "PDFEncoding.pl";

# Load CMap.pl at run time, bracketing the load with progress messages
# on STDERR.
print STDERR "loading CMap...\n";
require "CMap.pl";
print STDERR "done.\n";

# NOTE(review): the purpose of this one-second pause is not evident from the
# code; presumably it lets the user read the messages above -- confirm
# before removing.
sleep 1;

# Global variables:
# PDFFH
#   Bareword file handle of the PDF file currently being converted
#   (opened in the main loop, read by the subs below).
# Handle that extracted text is printed to (STDOUT or the .txt file).
my($txtfh) = "";
# Name of the PDF file currently being converted (main loop variable,
# also read by set_start_offset()).
local($pdffile);
# Byte offset added to every xref offset; set by set_start_offset().
my($pdf_start_offset);
# "objnum gen" => byte offset of the object; filled by read_xref_trailer().
my(%xref);
# Entries of the trailer dictionary; filled by read_xref_trailer().
my(%trailer);
# Entries of the document information dictionary; filled by read_info().
my(%info);
# Object references ("objnum gen") of the pages; filled by read_pages().
my(@pages);
# Object reference => reference of its /Parent; filled by add_pages().
my(%parents);
# Page reference => hash ref of font name => {subtype, encoding,
# encoding_array}; filled by read_encodings().
my(%encodings);
# Entry of %encodings selected by set_encoding() ("" when none applies).
my($cur_encoding);
# Font name selected by the most recent set_cmap() call.
my($cur_font);
# Encoding/CMap name selected by the most recent set_cmap() call.
my($cur_cmap);

# CMap name => escape sequence that _cid2code() puts in front of the text
# converted through that CMap.
# NOTE(review): the byte values look like ISO-2022-JP character-set
# designations -- confirm against the standard before relying on that.
%name2esc = ();
foreach my $group (
    [ "\x1b\x24\x40",     "78-H", "78-V" ],
    [ "\x1b\x24\x42",     "H", "V", "Ext-H", "Ext-V" ],
    [ "\x1b\x24\x28\x44", "Hojo-H", "Hojo-V" ],
    [ "\x1b\x28\x49",     "Katakana" ],
) {
    my($escape, @cmap_names) = @$group;
    foreach my $cmap_name (@cmap_names) {
        $name2esc{$cmap_name} = $escape;
    }
}

# Glyph name => plain-text replacement printed by decode_charcode() for a
# character code whose entry in the current encoding array is that glyph.
%char2print = (
    # letters, ligatures and symbols reduced to plain letters
    'Lslash'         => 'L',
    'OE'             => 'OE',
    'Scaron'         => 'S',
    'Ydieresis'      => 'Y',
    'Zcaron'         => 'Z',
    'dotlessi'       => 'i',
    'fi'             => 'fi',
    'fl'             => 'fl',
    'lslash'         => 'l',
    'oe'             => 'oe',
    'scaron'         => 's',
    'trademark'      => 'TM',
    'zcaron'         => 'z',

    # punctuation and symbols
    'ampersand'      => '&',
    'asciicircum'    => '^',
    'asciitilde'     => '~',
    'asterisk'       => '*',
    'at'             => '@',
    'backslash'      => '\\',
    'bar'            => '|',
    'braceleft'      => '{',
    'braceright'     => '}',
    'bracketleft'    => '[',
    'bracketright'   => ']',
    'bullet'         => '*',
    'colon'          => ':',
    'comma'          => ',',
    'degree'         => '-',
    'dollar'         => '$',
    'ellipsis'       => '...',
    'emdash'         => '-',
    'endash'         => '-',
    'equal'          => '=',
    'exclam'         => '!',
    'greater'        => '>',
    'hyphen'         => '-',
    'less'           => '<',
    'numbersign'     => '#',
    'parenleft'      => '(',
    'parenright'     => ')',
    'percent'        => '%',
    'period'         => '.',
    'plus'           => '+',
    'question'       => '?',
    'semicolon'      => ';',
    'slash'          => '/',
    'space'          => ' ',
    'underscore'     => '_',
    'yen'            => '\\',

    # quotation marks
    'quotedbl'       => '"',
    'quotedblbase'   => '"',
    'quotedblleft'   => '"',
    'quotedblright'  => '"',
    'quoteleft'      => '`',
    'quoteright'     => "'",
    'quotesinglbase' => "'",
    'quotesingle'    => "'",
);

# Main loop: convert every PDF file named on the command line.
#
# For each file: open it, choose the output handle (-s => stdout, otherwise
# <name>.txt), read the cross-reference table and trailer, refuse encrypted
# files, optionally print the document information (-i), then decode and
# print the content streams of every page in the range -f .. -l.
foreach $pdffile (@ARGV) {
    my($txtfile);
    my($nxref);

    # Three-argument open: the file name is never interpreted as an open
    # mode or a pipe, whatever characters it contains.
    if (!open(PDFFH, '<', $pdffile)) {
        warn "Can't open $pdffile: $!\n";
        next;
    }
    binmode PDFFH;

    if ($opt_s == 1) {
        $txtfh = \*STDOUT;
        print STDERR "convert: $pdffile ==> stdout\n";
    } else {
        if ($pdffile =~ /(.*)\.pdf$/i) {
            $txtfile = "$1.txt";
        } else {
            $txtfile = "$pdffile.txt";
        }
        if (!open($txtfh, '>', $txtfile)) {
            warn "Can't open $txtfile: $!\n";
            close PDFFH;
            next;
        }
        print STDERR "convert: $pdffile ==> $txtfile\n";
    }

    # set start offset (in bytes) of PDF content
    &set_start_offset();

    # read the xref table and the trailer; try CR-terminated lines first,
    # then LF-terminated lines
    $nxref = &read_xref_trailer("\r");
    $nxref = &read_xref_trailer("\n") if $nxref == 0;

    # Without a single xref entry every object lookup would fall back to the
    # start of the file, so give up on this file instead of reading garbage.
    if ($nxref == 0) {
        warn "  No cross-reference table found. (not process)\n";
        close $txtfh if $opt_s != 1;
        close PDFFH;
        next;
    }

    # encrypted files are not supported; release both handles before
    # moving on to the next file
    if (defined $trailer{'Encrypt'}) {
        warn "  Encrypted. (not process)\n";
        close $txtfh if $opt_s != 1;
        close PDFFH;
        next;
    }

    # read info dictionary
    &read_info();

    if ($opt_i == 1) {
        my($t);
        foreach $t (sort keys(%info)) {
            print STDERR "  $t: $info{$t}\n";
        }
    }

    # read the list of pages
    &read_pages();

    # read the encoding of every font used on the pages
    &read_encodings();

    # process the contents of the pages
    for ($i = 0; $i <= $#pages; $i++) {
        next if $i < $opt_f || $i > $opt_l;

        my(@contents) = &read_contents($pages[$i]);

        &set_encoding($pages[$i]);

        for ($j = 0; $j <= $#contents; $j++) {
            my($data, $filter);
            # $stream must stay a package variable: it is handed to
            # decode_stream() and output_stream() as the typeglob *stream.
            local($stream);
            my(@decodes);

            $data = &get_object_by_ref($contents[$j]);

            # split the object into its dictionary and its raw stream data
            if ($data =~ /^\d+\s+\d+\s+obj\s+\<\<\s+(.*)\s+\>\>\s+stream\r?\n?(.*)\r?\n?endstream\s+/s) {
                $filter = $1;
                $stream = $2;
            } else {
                next;
            }

            @decodes = &decode_mechod($filter);
            next if $decodes[0] eq "NO_APPLY";

            next if &decode_stream(*stream, \@decodes) == 0;

            &output_stream(*stream);
        }
        # a form feed separates the pages in the output
        print $txtfh "\n\f\n";
    }
    close $txtfh if $opt_s != 1;
    close PDFFH;

    print STDERR "  done.\n";
}

&PDFLZW::unlink_tmpfile();

# Set $pdf_start_offset, the byte offset at which the PDF data starts
# inside the file open on PDFFH.
#
# The offset is 0 unless the file looks like MacBinary II: byte 1 holds a
# file name length of 1..63, the name stored from byte 2 equals $pdffile
# (case-insensitively) and the four bytes at offset 65 start with "pdf"
# (any case).  In that case the offset is 128.
#
# Reads the globals PDFFH and $pdffile; writes $pdf_start_offset.
sub set_start_offset {
    my($fnsize, $fn, $ftype);

    $pdf_start_offset = 0;

    # check if MacBinary II or not
    seek(PDFFH, 1, 0);
    read(PDFFH, $fnsize, 1);
    $fnsize = unpack('C', $fnsize);
    return if $fnsize < 1 || $fnsize > 63;

    seek(PDFFH, 2, 0);
    read(PDFFH, $fn, $fnsize);
    # \Q...\E: the stored name is data, not a pattern.  Without quoting, a
    # name such as "a(b.pdf" is an invalid regex and aborts the program.
    return unless $pdffile =~ /^\Q$fn\E$/i;

    seek(PDFFH, 65, 0);
    read(PDFFH, $ftype, 4);
    $pdf_start_offset = 128 if $ftype =~ /^pdf/i;

    return;
}

# Return the raw text of the object whose reference ("objnum gen") is $ref.
#
# Seeks PDFFH to the offset recorded in %xref (plus $pdf_start_offset) and
# reads up to and including the next "endobj".  The input record separator
# is changed only for the duration of this call.
sub get_object_by_ref {
    my($ref) = @_;

    seek(PDFFH, $xref{$ref} + $pdf_start_offset, 0);

    local($/) = "endobj";
    my $object = <PDFFH>;

    $object;
}

# Return the text from the current position of PDFFH up to and including
# the next "endobj" (or up to end of file when there is none).  The input
# record separator is changed only for the duration of this call.
sub get_object_in_seek {
    local($/) = "endobj";

    return scalar(<PDFFH>);
}

# Parse the dictionary entries found in $line and store them in the hash
# referenced by $list (key = entry name without the leading slash).
#
# Values are stored as follows:
#   indirect reference "n g R"  => "n g"
#   number                      => the leading integer digits
#   name "/Name"                => "Name"
#   array "[ ... ]"             => the array content, trimmed
#   "<...>" and "(...)" strings => the string including its delimiters
# Entries are recognised only when the "/" of their name directly follows
# a CR or LF.
sub parse_object {
    my($line, $list) = @_;

    # Keep only the dictionary body when $line is a complete
    # "n g obj << ... >> endobj" object.
    $line = $1
        if $line =~ /^\d+\s+\d+\s+obj\s+\<\<\s*(.+)\s*\>\>\s+endobj\s*$/s;

    # Join continuation lines that start with digits or "]" to the
    # previous line.
    $line =~ s/\r([\d\]]+)/$1/g;

    # The leading CR lets the first entry be split off like all others.
    foreach my $entry (split /[\r\n]\//, "\r" . $line) {
        if ($entry =~ /^(\S+)\s+(\d+)\s+(\d+)\s+R\s*/s) {
            $list->{$1} = "$2 $3";
        }
        elsif ($entry =~ /^(\S+)\s+(\d+)\s*/s) {
            $list->{$1} = $2;
        }
        elsif ($entry =~ /^(\S+)\s+\/(\S+)\s*/s) {
            $list->{$1} = $2;
        }
        elsif ($entry =~ /^(\S+)\s*\[\s*(.+)\s*\]\s*/s) {
            my($key, $value) = ($1, $2);
            $value =~ s/^\s*//;
            $value =~ s/\s*$//;
            $list->{$key} = $value;
        }
        elsif ($entry =~ /^(\S+)\s*(\<(.+)\>)\s*/s) {
            $list->{$1} = $2;
        }
        elsif ($entry =~ /^(\S+)\s*(\(.+\))\s*/s) {
            $list->{$1} = $2;
        }
    }
}

# Read every cross-reference section and trailer of the file open on PDFFH.
#
# $sep is the line terminator to read with ("\r" or "\n").  %xref is
# rebuilt as "objnum gen" => offset for every in-use ("n") entry, %trailer
# is rebuilt from the trailer dictionary; a hexadecimal /ID value is
# converted to raw bytes.  PDFFH is rewound before returning.
#
# Returns the number of entries stored in %xref (0 when no table was
# found with this line terminator).
sub read_xref_trailer {
    my($sep) = @_;

    %xref = ();
    %trailer = ();

    seek(PDFFH, 0, 0);

    # Localised so that the caller's record separator is back in place on
    # return; a plain assignment would leave "\r" active for all later reads.
    local($/) = $sep;

    while (<PDFFH>) {
        # chomp removes only the terminator, so an unterminated last record
        # keeps its final character
        chomp;
        next if !/^\s*xref\s*$/;

        # subsections: a "first count" header followed by count entries
        while (<PDFFH>) {
            chomp;
            last unless /\s*(\d+)\s+(\d+)\s*$/;

            my($obj_num, $obj_cnt) = ($1, $2);
            my($i);

            for ($i = 0; $i < $obj_cnt; $i++) {
                $_ = <PDFFH>;
                # truncated table: stop instead of looping over end of file
                last unless defined $_;
                if (/\s*(\d+)\s+(\d+)\s+n\s*$/) {
                    # the generation number is stored without leading zeros
                    $xref{($obj_num + $i) . " " . sprintf("%d", $2)} = $1;
                }
            }
        }

        # $_ now holds the line that ended the subsection loop
        if (/^\s*trailer(\s*$|\s+\<\<)/) {
            my($line) = get_object_in_seek();
            &parse_object($line, \%trailer);
            if ($trailer{'ID'} =~ /\s*\<([\da-fA-F]+)\>/) {
                $trailer{'ID'} = $1;
                $trailer{'ID'} =~ s/([\dA-F]{2})/pack('C',hex($1))/eig;
            }
        }
    }

    seek(PDFFH, 0, 0);

    scalar keys(%xref);
}

# Replace the backslash escapes \n \r \t \b \f \\ \( \) in the string
# referenced by $string, in place.
#
# All escapes are handled in a single left-to-right pass, so an escaped
# backslash followed by a letter (for instance "\\n") yields a backslash
# and that letter rather than a newline.  Octal escapes (\ddd) are left
# untouched; the callers handle them separately.
sub unescape_string {
    my($string) = @_;
    my(%plain) = (
        'n'  => "\n",
        'r'  => "\r",
        't'  => "\t",
        'b'  => "\b",   # backspace
        'f'  => "\f",
        '\\' => "\\",
        '('  => "(",
        ')'  => ")",
    );

    $$string =~ s/\\([nrtbf\\()])/$plain{$1}/g;
}

# Return the references of all /Page objects reachable from the page tree
# node $ref, in /Kids order.
#
# Records the /Parent of every visited node in %parents.  A node whose
# /Type is not "Page" is treated as an inner node and its /Kids are
# visited recursively.
sub add_pages {
    my($ref) = @_;
    my(%node) = ();
    my(@leaves) = ();

    my($object) = &get_object_by_ref($ref);
    &parse_object($object, \%node);

    $parents{$ref} = $node{'Parent'} if defined $node{'Parent'};

    if ($node{'Type'} eq "Page") {
        @leaves = ($ref);
    } else {
        # /Kids is "n g R n g R ..."; splitting on "R" leaves "n g" pieces
        foreach my $kid (split /R/, $node{'Kids'}) {
            next unless $kid =~ /^\s*(\d+)\s+(\d+)\s*$/;
            push @leaves, &add_pages("$1 $2");
        }
    }

    @leaves;
}

# Fill %info from the document information dictionary named by the /Info
# entry of the trailer.
#
# %info is emptied first and stays empty when the trailer has no /Info
# entry.  Each value is decoded: hexadecimal and literal string delimiters
# are removed, octal and backslash escapes are resolved, the "D:" prefix of
# CreationDate/ModDate is dropped, and values starting with a byte order
# mark are converted with Jcode.
sub read_info {
    my($line, $t);

    %info = ();

    # Without /Info there is nothing to read.  Looking up an undefined
    # reference would fetch whatever object sits at the start of the file
    # and report its entries as document information.
    return unless defined $trailer{'Info'};

    $line = &get_object_by_ref($trailer{'Info'});
    &parse_object($line, \%info);

    foreach $t (keys %info) {
        if ($info{$t} =~ /^\<(.+)\>$/) {
            # hexadecimal string
            $info{$t} = $1;
            $info{$t} =~ s/([\dA-F]{2})/pack('C',hex($1))/eig;
        } elsif ($info{$t} =~ /^\((.+)\)$/) {
            # literal string; a backslash before CR continues the line
            $info{$t} = $1;
            $info{$t} =~ s/\\\r//g;
        }
        $info{$t} =~ s/\\(\d{3})/pack('C',oct($1))/eig;

        &unescape_string(\$info{$t});

        if (($t eq "CreationDate") || ($t eq "ModDate")) {
            $info{$t} = $1 if $info{$t} =~ /^D:\s*(.*)\s*$/;
        }

        if (($info{$t} =~ /^\xfe\xff(.*)$/) || ($info{$t} =~ /^\xff\xfe(.*)$/)) {
            $info{$t} = $1;
            # An escape-delimited language code may follow the byte order
            # mark; only values tagged "ja" (or untagged) are converted.
            if ($info{$t} =~ /^\x00\x1b(.{2,4})\x00\x1b(.*)$/) {
                my($lc) = $1;
                $info{$t} = $2;
                next unless $lc =~ /^ja/;
            }
            # NOTE(review): both byte orders are converted as "ucs2";
            # confirm that Jcode treats little-endian input correctly.
            $info{$t} = Jcode::convert($info{$t}, 'euc', 'ucs2');
        }
    }
}

# Rebuild @pages (page references in document order) and %parents by
# walking the page tree named by the /Pages entry of the catalog, which is
# the object referenced by the trailer's /Root entry.
sub read_pages {
    my(%catalog) = ();

    @pages = ();
    %parents = ();

    my($root) = &get_object_by_ref($trailer{'Root'});
    &parse_object($root, \%catalog);

    @pages = &add_pages($catalog{'Pages'});
}

# Rebuild %encodings: for every page in @pages that has font resources,
# store a hash ref of
#     font resource name => { subtype, encoding, encoding_array }
# under the page reference.
#
# The font list is taken from the /Font entry of the page's /Resources
# (either may be inline or an indirect reference).  For each font the
# encoding name comes from /Encoding when it is a name, otherwise from the
# /BaseEncoding of the encoding dictionary, otherwise from
# PDFEncoding's table keyed by /BaseFont.  When PDFEncoding knows the
# encoding, its code table is copied and the /Differences array is applied.
# Pages or fonts that do not match the expected layout are skipped.
sub read_encodings {
    my($i, $line, $pline);

    %encodings = ();

    for ($i = 0; $i <= $#pages; $i++) {
        my(@f) = ();
        my(%fn) = ();
        my($fref);
        my($encoding_p) = {};

        $line = &get_object_by_ref($pages[$i]);
        $line =~ s/[\r\n]/ /g;

        # locate the resource dictionary of the page
        if ($line =~ /\/Resources\s+(\d+)\s+(\d+)\s+R/) {
            $pline = &get_object_by_ref("$1 $2");
            $pline =~ s/[\r\n]/ /g;
        } elsif ($line =~ /\/Resources\s+\<\<\s+(.+)/) {
            $pline = $1;
        } else {
            next;
        }

        # locate the font list inside the resource dictionary
        if ($pline =~ /\/Font\s+(\d+)\s+(\d+)\s+R/) {
            $pline = &get_object_by_ref("$1 $2");
            $pline =~ s/[\r\n]/ /g;

            if ($pline =~ /\s+\<\<\s+([\da-zA-Z\/\s]+)\s+\>\>/) {
                &_collect_font_refs("$1", \@f, \%fn);
            } else {
                next;
            }
        } elsif ($pline =~ /\/Font\s+\<\<\s+([\da-zA-Z\-\/\s]+)\s+\>\>/) {
            &_collect_font_refs("$1", \@f, \%fn);
        } else {
            next;
        }

        foreach $fref (@f) {
            my($fline);
            my($subtype, $enc);
            my($basefont);
            my($eline);
            my($differences) = "";
            my(@enc_array) = ();

            $fline = &get_object_by_ref($fref);
            $fline =~ s/[\r\n]/ /g;

            if ($fline =~ /\/Subtype\s+\/(\S+)\s/) {
                $subtype = $1;
            } else {
                next;
            }

            if ($fline =~ /\/BaseFont\s+\/(\S+)\s/) {
                $basefont = $1;
            } else {
                $basefont = "";
            }

            if ($fline =~ /\/Encoding\s+\/(\S+)\s/) {
                # predefined encoding or CMap given by name
                $enc = $1;
            } else {
                # encoding dictionary, indirect or inline
                if ($fline =~ /\/Encoding\s+(\d+)\s+(\d+)\s+R/) {
                    $eline = &get_object_by_ref("$1 $2");
                    $eline =~ s/[\r\n]/ /g;
                } elsif ($fline =~ /\/Encoding\s+\<\<\s+([\da-zA-Z\/\s]+)\s+\>\>/) {
                    $eline = $1;
                } else {
                    next;
                }

                # "\/" is intended here: the former "\B" was a non-word-
                # boundary assertion that matched "BaseEncoding" only by
                # accident.
                if ($eline =~ /\/BaseEncoding\s+\/(\S+)\s/) {
                    $enc = $1;
                } else {
                    if (&PDFEncoding::check_encoding_by_basefont($basefont) == 1) {
                        $enc = &PDFEncoding::get_name_by_basefont($basefont);
                    } else {
                        $enc = "";
                    }
                }
            }

            # "/Differences [ code /name /name code /name ... ]" becomes a
            # blank-separated list of codes and glyph names
            if ($eline =~ /\/Differences\s*\[\s*([\da-zA-Z\/\s]+)\s*\]/) {
                $differences = $1;
                $differences =~ s/[\/\r\n\s]+/ /g;
            }

            if (&PDFEncoding::check_encoding($enc) == 1) {
                my($code, $var) = (0, "");

                @enc_array = &PDFEncoding::get_encoding($enc);

                # a number sets the next code; each name that follows is
                # assigned to consecutive codes
                foreach $var (split /\s+/, $differences) {
                    if ($var =~ /^(\d+)$/) {
                        $code = $var;
                    } else {
                        $enc_array[$code++] = $var;
                    }
                }
            }

            # register under the resource name, or under the font's own
            # /Name when the resource name is unknown
            if (defined $fn{$fref}) {
                $$encoding_p{$fn{$fref}} = {subtype => $subtype, encoding => $enc, encoding_array => [@enc_array]};
            } elsif ($fline =~ /\/Name\s+\/(\S+)\s/) {
                $$encoding_p{$1} = {subtype => $subtype, encoding => $enc, encoding_array => [@enc_array]};
            }
        }

        $encodings{$pages[$i]} = $encoding_p;
    }
}

# Collect the "/Name n g R" entries of the font dictionary body $dict:
# push every "n g" reference onto @$refs and record its resource name in
# %$names (reference => name).
sub _collect_font_refs {
    my($dict, $refs, $names) = @_;

    foreach my $entry (split /R/, $dict) {
        if ($entry =~ /\/(\S+)\s+(\d+)\s+(\d+)\s*$/) {
            push @$refs, "$2 $3";
            $names->{"$2 $3"} = $1;
        }
    }
}

# Return the content stream references behind $ref.
#
# When the object $ref is an array of indirect references, the list of
# those references ("n g") is returned; otherwise $ref itself is taken to
# be the content stream and returned as the only element.
sub add_contents {
    my($ref) = @_;
    my(@streams) = ();

    my($object) = &get_object_by_ref($ref);

    if ($object =~ /^\d+\s+\d+\s+obj\s+\[\s*(.+)\s*\]\s+endobj\s*$/s) {
        my($items) = $1;
        # "n g R n g R ..." split on "R" leaves one "n g" piece per entry
        @streams = map { /^\s*(\d+)\s+(\d+)\s*$/ ? ("$1 $2") : () }
                   split(/R/, $items);
    } else {
        @streams = ($ref);
    }

    @streams;
}

# Return the references of all content streams of the page $ref, in the
# order given by the page's /Contents entry.  Each referenced object is
# expanded through add_contents().
sub read_contents {
    my($ref) = @_;
    my(%page) = ();
    my(@streams) = ();

    my($object) = &get_object_by_ref($ref);
    &parse_object($object, \%page);

    # /Contents is "n g" or "n g R n g R ..."; split on "R" to get the pieces
    foreach my $item (split /R/, $page{'Contents'}) {
        next unless $item =~ /^\s*(\d+)\s+(\d+)\s*$/;
        push @streams, &add_contents("$1 $2");
    }

    @streams;
}

# Return the list of decode filters named in the stream dictionary $line.
#
# Returns
#   - an empty list when the dictionary has no /Filter entry,
#   - ("NO_APPLY") when the dictionary describes an XObject or an image
#     (/Type .. XObject, /Subtype .. Image, or any of /Width, /Height,
#     /BitsPerComponent, /ColorSpace), i.e. a stream that holds no text,
#   - otherwise the filter names in order, e.g.
#     ("/ASCII85Decode", "/FlateDecode").
#
# /Filter may be a single name or an array; whitespace between "/Filter"
# and its value, and between the names of an array, is optional, and the
# entry may be the last one of the dictionary.
sub decode_mechod  {
    my($line) = @_;
    my($filter);

    if ($line =~ /\/Filter\s*\[([^\]]*)\]/) {
        # array of names; stop at the first "]" so that later arrays in
        # the dictionary are not swallowed
        $filter = $1;
    } elsif ($line =~ /\/Filter\s*(\/[^\s\/\[\]\<\>\(\)]+)/) {
        # single name, ended by whitespace or the next delimiter
        $filter = $1;
    } elsif ($line =~ /\/Filter\s+(\S+)/) {
        # anything else (e.g. an indirect reference) is returned as is and
        # ends up being treated as an unknown filter
        $filter = $1;
    } else {
        return ();
    }

    if ($line =~ /\s*\/Type\s+\/(.*)\s+/) {
        if ($1 =~ /XObject/) {
            $filter = "NO_APPLY";
        }
    }
    if ($line =~ /\s*\/Subtype\s+\/(.*)\s+/) {
        if ($1 =~ /Image/) {
            $filter = "NO_APPLY";
        }
    }
    if ($line =~ /\s*\/(?:Width|Height|BitsPerComponent|ColorSpace)\s+/) {
        $filter = "NO_APPLY";
    }

    # make sure every name starts a new field, then split on whitespace
    $filter =~ s/\// \//g;
    $filter =~ s/^\s+//;

    split(/\s+/, $filter);
}

# Decompress the zlib (deflate) data $input with Compress::Zlib and return
# the result.  Returns "" after printing a warning when the inflation
# stream cannot be created or the data cannot be inflated.
sub inflate {
    my($input) = @_;

    my $zstream = inflateInit();
    unless ($zstream) {
        warn "Cannot create a inflation stream\n";
        return "";
    }

    my($output, $status) = $zstream->inflate(\$input);

    # anything but "ok so far" or "end of stream" is a failure
    unless ($status == Z_OK || $status == Z_STREAM_END) {
        warn "inflation failed\n";
        return "";
    }

    $output;
}

# Apply the decode filters listed in @$decodes, in order, to the scalar of
# the typeglob passed as first argument (the data is replaced in place).
#
# Returns 0 when an unknown filter is met before any filter was applied,
# 1 otherwise.  Decoding stops early when the data becomes empty.  A lone
# /ASCII85Decode filter is not applied; an unknown filter that follows an
# applied one is ignored.
sub decode_stream {
    local(*stream, $decodes) = @_;
    my($applied) = 0;

    foreach $filter (@$decodes) {
        if ($filter eq "/ASCIIHexDecode") {
            $stream =~ s/([\dA-F]{2})/pack('C',hex($1))/eig;
        } elsif ($filter eq "/ASCII85Decode") {
            last if @$decodes == 1;
            $stream = &Base85::decode($stream);
        } elsif ($filter eq "/FlateDecode") {
            $stream = &inflate($stream);
        } elsif ($filter eq "/LZWDecode") {
            $stream = &PDFLZW::uncompress($stream);
        } else {
            # unknown filter: fatal only when nothing was decoded so far
            return 0 if $applied == 0;
            next;
        }

        $applied++;
        last if length($stream) == 0;
    }

    1;
}

# Extract the text of a decoded content stream and print it to $txtfh.
#
# The argument is a typeglob whose scalar holds the stream.  The stream is
# processed line by line: "Tf" selects the font (set_cmap) and the font
# size, set_matrix() tracks the text position, extract_text() returns the
# text shown by the line.  Each piece of text is remembered with its
# position and font size; on output a newline is printed when the y
# position changed by 1 or more, and a space when the x position advanced
# by at least one font size.
sub output_stream {
    local(*stream) = @_;
    my(@text_contents) = ();
    my($line);
    # BT/ET lines are tracked and skipped; lines outside a text object are
    # not filtered out by this flag.
    my($bt) = 0;
    local($ly) = 0;
    local(@tm) = (1, 0, 0, 1, 0, 0);
    my($fontsize) = 1;
    my($x_prev, $y_prev) = (0, 0);
    my($t);

    foreach $line (split(/[\r\n]/, $stream)) {

        if ($bt == 0) {
            if ($line =~ /^\s*BT\s*$/) {
                $bt = 1;
                next;
            }
        }
        if ($line =~ /^\s*ET\s*$/) {
            $bt = 0;
            next;
        }
        next if $line =~ /^\s*$/;

        if ($line =~ /\s*\/(\S+)\s+([\d\.]+)\s+Tf\s*/) {
            &set_cmap($1);
            $fontsize = $2;
        }

        &set_matrix($line, \$ly, \@tm);

        $line = &extract_text($line);
        next if length($line) == 0;

        push @text_contents, {x => $tm[4], y => $tm[5], fs => $fontsize, d => $line};
    }

    foreach $t (@text_contents) {
        if (abs($t->{y}-$y_prev) >= 1) {
            print $txtfh "\n";
        } elsif ($t->{fs} > 0 && ($t->{x}-$x_prev)/$t->{fs} >= 1) {
            # the font size may legally be 0 ("/F1 0 Tf"); the guard keeps
            # the division from aborting the program
            print $txtfh " ";
        }
        print $txtfh $t->{d};
        $x_prev = $t->{x};
        $y_prev = $t->{y};
    }
}

# Update the text position for the text-positioning operator found in the
# content stream line $line.
#
#   $ly  reference to the current leading
#   $tm  reference to the text matrix (a b c d e f); e and f are the
#        current x and y position
#
# Operators handled: T* ' "  (move to the next line), TL (set leading),
# Td (move), TD (move and set leading to -ty), Tm (set the matrix).
# Lines without any of these operators leave both values untouched.
sub set_matrix {
    my($line, $ly, $tm) = @_;

    if ($line =~ /\s*T\*\s*/ || $line =~ /\s*\'\s*$/ || $line =~ /\s*\"\s*$/) {
        # Next line: same as "0 -leading Td".  The leading is kept as a
        # positive number (TD stores -ty), so it has to be negated here.
        &_translate_text_matrix($tm, 0, -$$ly);
    } elsif ($line =~ /\s*([\d\-\+\.]+)\s+TL\s*/) {
        $$ly = $1;
    } elsif ($line =~ /\s*([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+Td\s*/) {
        &_translate_text_matrix($tm, $1, $2);
    } elsif ($line =~ /\s*([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+TD\s*/) {
        $$ly = -$2;
        &_translate_text_matrix($tm, $1, $2);
    } elsif ($line =~ /\s*([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+([\d\-\+\.]+)\s+Tm\s*/) {
        @$tm[0 .. 5] = ($1, $2, $3, $4, $5, $6);
    }
}

# Move the position stored in the text matrix @$tm by ($tx, $ty), given in
# text space units.
sub _translate_text_matrix {
    my($tm, $tx, $ty) = @_;
    my($x) = $$tm[0]*$tx + $$tm[2]*$ty + $$tm[4];
    my($y) = $$tm[1]*$tx + $$tm[3]*$ty + $$tm[5];

    $$tm[4] = $x;
    $$tm[5] = $y;
}

# Return the text shown by the content stream line $line, or "" when the
# line contains no TJ/Tj text-showing operator.
#
# Hexadecimal strings are converted to bytes and passed through cid2code();
# literal strings are unescaped and their control/high bytes mapped through
# decode_charcode().  For the current font being Type3, control characters
# in the result are replaced by spaces.
sub extract_text {
    my($line) = @_;
    local($data) = "";

    # Step 1: reduce the line to the string operand(s) of TJ or Tj.
    if ($line =~ /\[(.*)\]\s*TJ/) {
        # array operand of TJ: strings mixed with positioning numbers
        $line = $1;
	$line =~ s/^\s*//;
	$line =~ s/\s*$//;
	# drop a number in front of the first string
	$line =~ s/^[\d\-\.]+\s*//;
	# surround numbers between hex strings with blanks so that the
	# split below sees them as separate tokens
	$line =~ s/\>\s*([\d\-\.]+)\s*\</\> $1 \</g;
	# join adjacent literal strings: ") number (" becomes a space or
	# nothing, as decided by add_space_in_word()
	$line =~ s/(\)\s*)([\d\-\.]+)(\s*\()/&add_space_in_word($2)/eg;
    } elsif ($line =~ /\<(.*)\>\s*Tj/) {
	$line = "<$1>";
    } elsif ($line =~ /\((.*)\)\s*Tj/) {
	$line = "($1)";
    } else {
	return "";
    }

    # Step 2: decode the string(s).
    if ($line =~ /^\s*(\<.*\>)\s*$/) {
	# hexadecimal strings, possibly separated by numbers
	my($pline) = $1;
	my($t);

	foreach $t (split /\s+/, $pline) {
	    if ($t =~ /^\<(.*)\>$/) {
		$t = $1;
		# hex digits to bytes, then character IDs to character codes
		$t =~ s/([\dA-F]{2})/pack('C',hex($1))/eig;
		&cid2code(\$t);
	    } elsif ($t =~ /^[\d\-\.]+$/) {
		$t = &add_space_in_word($t);
	    } else {
		next;
	    }
	    $data .= $t;
	}
    } elsif ($line =~ /\((\\[\d\\]+)\)$/) {
        # literal string made of backslashes and digits only: the octal
        # escapes are turned into bytes without any further mapping
        $data = $1;
        $data =~ s/\\(\d{3})/pack('C',oct($1))/eig;
    } elsif ($line =~ /\((.*)\)$/) {
        # any other literal string
        $data = $1;
	&unescape_string(\$data);
	# control and high bytes, then remaining octal escapes, go through
	# the encoding of the current font
	$data =~ s/([\x00-\x1f\x7f-\xff])/&decode_charcode(unpack('C', $1))/eg;
	$data =~ s/\\(\d{3})/&decode_charcode(oct($1))/eg;
    } else {
        return "";
    }

    if (&check_type("Type3") == 1) {
	$data =~ s/[\x00-\x1f]/ /g;
    }

    $data;
}

# Tell whether the current font ($cur_font, looked up in $cur_encoding) has
# the subtype $type.  Returns 1 when it has, 0 when it has not, and -1 when
# no information about the current font is available.
sub check_type {
    my($type) = @_;
    my($font) = $$cur_encoding{$cur_font};

    return -1 unless defined $font;
    return ($font->{subtype} eq $type) ? 1 : 0;
}

# Decide whether a positioning number found between two strings of a TJ
# array stands for a word gap.  Returns " " when the number, scaled by
# 13/2000 and truncated towards zero, is -1 or less; returns "" otherwise.
sub add_space_in_word {
    my($adjustment) = @_;
    my($scaled) = int($adjustment*13/2000);

    return ($scaled <= -1) ? " " : "";
}

# Select the font encodings that apply to the page $ref.
#
# $cur_encoding and $cur_cmap are reset to "".  The entry of %encodings for
# $ref is used when there is one; otherwise the %parents chain is followed
# upwards until an entry is found.  Returns 1 and sets $cur_encoding when
# an entry was found, returns 0 when the chain ends without one.
sub set_encoding {
    my($ref) = @_;

    $cur_encoding = "";
    $cur_cmap = "";

    until (defined $encodings{$ref}) {
        $ref = $parents{$ref};
        return 0 unless defined $ref;
    }

    $cur_encoding = $encodings{$ref};
    return 1;
}

# Make the font named by the argument the current font.
#
# Sets $cur_font, $cur_encoding_array (the font's code table) and $cur_cmap
# (the font's encoding name, with a leading "Identity-" removed).  For a
# Type0 font the CMap is activated through CMap::set_cmap() and its result
# is returned; for every other font -1 is returned.
sub set_cmap {
    ($cur_font) = @_;
    my($font) = $$cur_encoding{$cur_font};

    $cur_encoding_array = $font->{encoding_array};

    ($cur_cmap = $font->{encoding}) =~ s/^Identity-(\S+)\s*/$1/;

    return -1 unless $font->{subtype} eq "Type0";
    return &CMap::set_cmap($cur_cmap);
}

# Convert the character IDs in the byte string referenced by $string into
# character codes, in place, using the CMap of the current font.
#
# Returns -1 when nothing is known about the current font and -2 when the
# font is not a Type0 font (the string is left untouched in both cases).
# When the current CMap has an entry in %name2esc the string is converted
# piece by piece with escape sequences added by _cid2code(), and 0 is
# returned.  Otherwise the whole string is handed to CMap::cid2code(): 1 is
# returned when that call returned 1 or 0, else its return value.
sub cid2code {
    my($string) = @_;
    my($p) = $$cur_encoding{$cur_font};
    my($data) = $$string;

    return -1 unless defined $p;
    return -2 if $p->{subtype} ne "Type0";

    if (defined $name2esc{$cur_cmap}) {
	# the result is rebuilt from scratch while $data is consumed
	$$string = "";

	while (length($data) >= 2) {

	    # run of byte pairs whose first byte is not zero
	    if ($data =~ /^(([\x01-\xff][\x00-\xff])+)/) {
		my($pdata) = $1;
		my($len) = 0;

		substr($data, 0, length($pdata)) = "";

		while (length($pdata) > 0){
		    # Try the current CMap first, then the fallbacks; the
		    # chain stops at the first call that does not fail.
		    if (&_cid2code($string, \$pdata, \$len, $cur_cmap) < 0 &&
			&_cid2code($string, \$pdata, \$len, "Katakana") < 0 &&
			&_cid2code($string, \$pdata, \$len, "Hankaku") < 0 &&
			&_cid2code($string, \$pdata, \$len, "Ext-H") < 0) {
			# nothing matched: copy two bytes (or the rest)
			# unchanged
			if (length($pdata) >= 2) {
			    $$string .= substr($pdata, 0, 2);
			    substr($pdata, 0, 2) = "";
			    next;
			} else {
			    $$string .= $pdata;
			    last;
			}
		    }
		}
	    # run of byte pairs whose first byte is zero
	    }elsif ($data =~ /^((\x00[\x00-\xff])+)/) {
		my($pdata) = $1;
		my($len) = 0;

		substr($data, 0, length($pdata)) = "";

		# only the non-zero bytes of the run are converted
		$pdata =~ s/\x00//g;

		while (length($pdata) > 0){
		    # "UniJIS-UCS2-H" is tried only when the next byte is
		    # below 0x80; the other CMaps serve as fallbacks.
		    if (($pdata =~ /^[\x80-\xff]/ ||
			 &_cid2code($string, \$pdata, \$len, "UniJIS-UCS2-H") < 0) &&
			&_cid2code($string, \$pdata, \$len, "Katakana") < 0 &&
			&_cid2code($string, \$pdata, \$len, "Hankaku") < 0 &&
			&_cid2code($string, \$pdata, \$len, "Ext-H") < 0) {
			# nothing matched: copy two bytes (or the rest)
			# unchanged
			if (length($pdata) >= 2) {
			    $$string .= substr($pdata, 0, 2);
			    substr($pdata, 0, 2) = "";
			    next;
			} else {
			    $$string .= $pdata;
			    last;
			}
		    }
		}
	    }
	}

	# a single leftover byte (odd length) is appended unchanged
	$$string .= $data;
	return 0;
    } else {
	my($len) = 0;
	my($ret);

	$ret = &CMap::cid2code(\$data, \$len);
	if ($ret == 1 || $ret == 0) {
	    $$string = $data;
	    return 1;
	} else {
	    return $ret;
	}
    }
}

# Return the printable replacement for the character code $code: the
# %char2print entry of the glyph name that the current encoding array
# assigns to the code, or the byte with that code when there is none.
sub decode_charcode {
    my($code) = @_;
    my($glyph) = $$cur_encoding_array[$code];
    my($print) = $char2print{$glyph};

    return defined($print) ? $print : pack('C', $code);
}

# Convert the leading part of the byte string referenced by $_pdata with
# the CMap $cmap and append the result to the string referenced by $string.
#
# CMap::cid2code() does the conversion and reports through $$_len how many
# leading bytes it handled; those bytes are removed from $$_pdata in every
# case.  On success the converted bytes (NUL bytes removed) are appended,
# wrapped in the escape sequence of $cmap from %name2esc and a closing
# "\x1b\x28\x4a" when the CMap has such an entry.
#
# Returns 1 when CMap::cid2code() returned 1, 0 when it returned 0 with a
# positive length, and -1 otherwise (nothing is appended then).
sub _cid2code {
    my($string, $_pdata, $_len, $cmap) = @_;
    my $esc = $name2esc{$cmap};
    my $esc_ascii = '';
    $esc_ascii = "\x1b\x28\x4a" if $esc;

    &CMap::set_cmap($cmap);
    my $ret = &CMap::cid2code($_pdata, $_len);

    # take the handled bytes off the front of the input
    my $conv = substr($$_pdata, 0, $$_len);
    substr($$_pdata, 0, $$_len) = "";

    $conv =~ tr/\x00//d;

    my $converted = ($ret == 1) || ($ret == 0 && $$_len > 0);
    return -1 unless $converted;

    $$string .= $esc . $conv . $esc_ascii;
    return ($ret == 1) ? 1 : 0;
}
