Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Requirements

You will need giftopnm and jpegtopnm (both part of netpbm) and gocr installed.

Additionally, you will need the perl module

...

No Format
# FuzzyOcr plugin, version 1
# written by Christian Holler decoder_at_own-hero_dot_net

package FuzzyOcr;

use strict;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Plugin;

use String::Approx 'adistr';

our @ISA = qw (Mail::SpamAssassin::Plugin);

# Words to search for in the OCR output; parse_config appends one entry per
# "focr_word" configuration line.
our @words = ( );
# Number of approximate word matches found so far; check_fuzzy_ocr resets it
# to 0 on entry and increments it once per matching word/line pair.
our $cnt = 0;

# Default values
# A word matches a line when abs(adistr(word, line)) is below this value;
# overridden by the "focr_treshold" option.
our $treshold = "0.3";
# Minimum number of matches required before FUZZY_OCR is reported;
# overridden by the "focr_counts_required" option.
our $countreq = 2;

# Constructor.
#
# Builds the plugin through the parent class constructor, re-blesses the
# result into the requested class and registers "check_fuzzy_ocr" as an
# eval rule.
#
# Arguments:
#   $invocant - class name, or an existing object whose class is reused
#   $mailsa   - passed through unchanged to the parent constructor
#
# Returns: the new plugin object.
sub new {
   my $invocant = shift;
   my ($mailsa) = @_;

   # Accept both Class->new(...) and $object->new(...).
   my $class = ref($invocant) || $invocant;

   my $plugin = bless( $class->SUPER::new($mailsa), $class );
   $plugin->register_eval_rule("check_fuzzy_ocr");

   return $plugin;
}

# Configuration callback: picks up the options owned by this plugin.
#
# Recognised keys:
#   focr_word            - appends the value to @words
#   focr_treshold        - replaces $treshold
#   focr_counts_required - replaces $countreq
#
# Arguments:
#   $self - the plugin object
#   $opts - hash reference; the {key} and {value} entries are read
#
# Returns: 1 when the key was one of the above, 0 otherwise.  The result is
# explicit so that it no longer depends on the configured value (previously
# a value such as "0" made a handled option look unhandled).
sub parse_config {
  my ($self, $opts) = @_;
  my $key = $opts->{key};
  return 0 unless defined $key;

  if ($key eq "focr_word") {
        push(@words, $opts->{value});
  } elsif ($key eq "focr_treshold") {
        $treshold = $opts->{value};
  } elsif ($key eq "focr_counts_required") {
        $countreq = $opts->{value};
  } else {
        # Not one of ours: let other plugins have a look at it.
        return 0;
  }

  # The option has been consumed here; no other plugin needs to see it.
  $self->inhibit_further_callbacks();
  return 1;
}

# Eval rule: runs OCR over the GIF and JPEG parts of a message and counts
# approximate occurrences of the configured words in the recognised text.
#
# For every part returned by find_parts("image") whose content type is
# image/gif or image/jpeg, the decoded body is piped through
# giftopnm/jpegtopnm into gocr.  The gocr output is captured in a temporary
# file and read back line by line; each line is reduced to lower-case
# letters and spaces and compared against every word with adistr().  A
# word/line pair counts as a match when abs(adistr) is below $treshold.
#
# Arguments:
#   $self - the plugin object
#   $pms  - per-message status object; {msg} and {conf} are used
#
# Side effects: resets and updates the package-level $cnt.  When $cnt
# reaches $countreq, a FUZZY_OCR hit is registered through
# $pms->_handle_hit with a score of 4 plus one point per match above the
# requirement.
#
# Returns: always 0; a hit is only ever reported through _handle_hit.
sub check_fuzzy_ocr {
   my ( $self, $pms ) = @_;
   $cnt = 0;

   # Compare against lower-cased copies so the configured @words are not
   # rewritten in place.
   my @needles = map { lc } @words;

   # Converter to PNM for each supported content type.
   my %decoder_for = (
      "image/gif"  => "/usr/bin/giftopnm",
      "image/jpeg" => "/usr/bin/jpegtopnm",
   );

   # Core module, loaded on first use; gives an unpredictable, exclusively
   # created file name instead of /tmp/spamassassin.focr.$$.
   require File::Temp;

   foreach my $part ( $pms->{msg}->find_parts("image") ) {
      my ( $ctype, $boundary, $charset, $name ) =
        Mail::SpamAssassin::Util::parse_content_type(
         $part->get_header('content-type') );

      my $decoder = defined $ctype ? $decoder_for{$ctype} : undef;
      next unless defined $decoder;

      # Removed automatically when $tmp goes out of scope, including on
      # every "next" below.
      my $tmp = eval {
         File::Temp->new(
            TEMPLATE => "spamassassin.focr.XXXXXXXX",
            DIR      => "/tmp",
            UNLINK   => 1,
         );
      };
      next unless defined $tmp;
      my $outfile = $tmp->filename;

      # Image data goes in on stdin; the recognised text lands in $outfile.
      my $pipeline = "$decoder - |/usr/bin/gocr -i - > $outfile";
      open( my $ocr_in, '|-', $pipeline ) or next;
      binmode $ocr_in;
      foreach my $chunk ( $part->decode() ) {
         print {$ocr_in} $chunk;
      }
      # Closing waits for the pipeline to finish.  A failing converter just
      # leaves little or no text behind, so the status is not treated as
      # fatal.
      close $ocr_in;

      open( my $ocr_out, '<', $outfile ) or next;
      while ( my $line = <$ocr_out> ) {
         # Keep letters and spaces only (this also drops the newline).
         $line =~ s/[^a-zA-Z ]//g;
         $line = lc $line;
         foreach my $needle (@needles) {
            my $matched = adistr( $needle, $line );
            if ( abs($matched) < $treshold ) {
               $cnt++;
            }
         }
      }
      close $ocr_out;
   }

   if ($cnt >= $countreq) {
         my $score = 4 + ($cnt - $countreq);
         $pms->_handle_hit("FUZZY_OCR", $score, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR}." ($cnt word occurrences found)");
   }
   return 0;
}

1;