Code examples from 'Internet Forensics'
Chapter 8 (File Contents)

Example 8-1: superstrings.pl
#!/usr/bin/perl -w
# Example 8-1: superstrings.pl

# Excerpted from 'Internet Forensics' by Robert Jones
# Published 2005 by O'Reilly Media (ISBN 0-596-10006-X)

die "Usage: $0 <word doc>\n" unless @ARGV == 1;
my %hash = ();
foreach my $encoding ('s', 'b', 'B', 'l', 'L') {
   my $text = `strings -td -e$encoding $ARGV[0]`;
   foreach my $line (split /\n/, $text) {
      if($line =~ /^\s*(\d+)\s+(.*)$/) {
         $hash{$1} = $2;
      }
   }
}

foreach my $offset (sort { $a <=> $b } keys %hash) {
    printf "%s\n", $hash{$offset};
}

Example 8-2: compare_text.pl
#!/usr/bin/perl -w
# Example 8-2: compare_text.pl

# Excerpted from 'Internet Forensics' by Robert Jones
# Published 2005 by O'Reilly Media (ISBN 0-596-10006-X)

die "Usage: $0 <file1> <file2>\n" unless @ARGV == 2;

my $minscore = 5;
my @words0 = ();
my @words1 = ();

loadWords($ARGV[0], \@words0);
loadWords($ARGV[1], \@words1);

my %segment = ();
my $score = 0;
my $maxscore = 0;
my $maxi0 = 0;
my $maxi1 = 0;

for(my $i0 = 0; $i0 < @words0; $i0++) {
   my $word0 = $words0[$i0];
   for(my $i1 = 0; $i1 < @words1; $i1++) {
      if(lc $words1[$i1] eq lc $word0) {
         ($maxscore, $maxi0, $maxi1) = 
              matchDiagonal(\@words0, \@words1, $i0, $i1);

         if(exists $segment{$maxi0}{$maxi1}) {
            if($maxscore > $segment{$maxi0}{$maxi1}){
               $segment{$maxi0}{$maxi1} = $maxscore;
            }
         } else {
            $segment{$maxi0}{$maxi1} = $maxscore;
         }
      }
   }
}

foreach my $maxi0 (sort keys %segment) {
   foreach my $maxi1(sort keys %{$segment{$maxi0}}) {
      $maxscore = $segment{$maxi0}{$maxi1};
      if($maxscore >= $minscore) {
		printf "%s\n\n", 
         traceBack(\@words0, \@words1, $maxi0, $maxi1, $maxscore);
      }
   }
}


sub matchDiagonal {
   # Extend an initial word match along both word arrays
   my ($words0, $words1, $i0, $i1) = @_;
   my $maxscore = 0;
   my $maxi0 = $i0;
   my $maxi1 = $i1;
   my $score = 0;
   my $j1 = $i1;
   for(my $j0 = $i0; $j0 < @$words0; $j0++) {
       if(lc $words0->[$j0] eq lc $words1->[$j1]) {
           $score++;
           if($score > $maxscore) {
               $maxscore = $score;
               $maxi0 = $j0;
               $maxi1 = $j1;
           }
       } else {
           $score--;
       }
       if($score < 0) {
           $score = 0;
           last;
       }
       $j1++;
       last if($j1 >= @$words1);
   }
   ($maxscore, $maxi0, $maxi1);
}


sub traceBack {
   # Trace back from the maximum score to reconstruct the matching string
   my ($words0, $words1, $maxi0, $maxi1, $score) = @_;
   my @array0 = ();
   my @array1 = ();
   my $i1 = $maxi1;
   for(my $i0 = $maxi0; $i0 >= 0; $i0--) {
       push @array0, $words0[$i0];
       push @array1, $words1[$i1];
       if(lc $words0[$i0] eq lc $words1[$i1]) {
           $score--;
       }
       last if($score == 0);
       $i1--;
       last if($i1 < 0);
   }

   my @array = ();
   for(my $i=0; $i<@array0; $i++) {
      if(lc $array0[$i] eq lc $array1[$i]) {
          push @array, $array0[$i];
      } else {
          push @array, sprintf "((%s/%s))", $array0[$i], $array1[$i];
      }
   }
   join ' ', reverse @array;
}


sub loadWords {
   # Read in the text word by word - skip short words
   my ($filename, $words) = @_;
   my $minsize = 4;
   open INPUT, "< $filename" or die "Unable to open file: $filename\n";
   while(<INPUT>) {
     $_ =~ s/[^a-zA-Z0-9]+/ /g;
     $_ =~ s/^\s+//;
     foreach my $word (split /\s+/, $_) {
       if(length $word >= $minsize) {
           push @$words, $word;
       }
     }
   }
   close INPUT;
}