This file is indexed.

/usr/share/popfile/Classifier/WordMangle.pm is in popfile 1.1.3+dfsg-0ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# POPFILE LOADABLE MODULE
package Classifier::WordMangle;

use POPFile::Module;
@ISA = ("POPFile::Module");

# ----------------------------------------------------------------------------
#
# WordMangle.pm --- Mangle words for better classification
#
# Copyright (c) 2001-2011 John Graham-Cumming
#
#   This file is part of POPFile
#
#   POPFile is free software; you can redistribute it and/or modify it
#   under the terms of version 2 of the GNU General Public License as
#   published by the Free Software Foundation.
#
#   POPFile is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with POPFile; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# ----------------------------------------------------------------------------

use strict;
use warnings;
use locale;

# These are used for Japanese support

my $ascii = '[\x00-\x7F]'; # ASCII chars
my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars

#----------------------------------------------------------------------------
# new
#
#   Class new() function
#----------------------------------------------------------------------------

sub new
{
    my $type = shift;
    my $self = POPFile::Module->new();

    $self->{stop__} = {};

    bless $self, $type;

    $self->name( 'wordmangle' );

    return $self;
}

sub start
{
    my ( $self ) = @_;

    $self->load_stopwords();

    return 1;
}

# ----------------------------------------------------------------------------
#
# load_stopwords, save_stopwords - load and save the stop word list in the stopwords file
#
# ----------------------------------------------------------------------------
sub load_stopwords
{
    my ($self) = @_;

    if ( open STOPS, '<' . $self->get_user_path_( 'stopwords' ) ) {
        delete $self->{stop__};
        while ( <STOPS> ) {
            s/[\r\n]//g;
            $self->{stop__}{$_} = 1;
        }

        close STOPS;
    } else { 
        $self->log_( 0, "Failed to open stopwords file" );
    }
}

sub save_stopwords
{
    my ($self) = @_;

    if ( open STOPS, '>' . $self->get_user_path_( 'stopwords' ) ) {
        for my $word (keys %{$self->{stop__}}) {
            print STOPS "$word\n";
        }

        close STOPS;
    }
}

# ----------------------------------------------------------------------------
#
# mangle
#
# Mangles a word into either the empty string to indicate that the word should be ignored
# or the canonical form
#
# $word         The word to either mangle into a nice form, or return empty string if this word
#               is to be ignored
# $allow_colon  Set to any value allows : inside a word, this is used when mangle is used
#               while loading the corpus in Bayes.pm but is not used anywhere else, the colon
#               is used as a separator to indicate special words found in certain lines
#               of the mail header
#
# $ignore_stops If defined ignores the stop word list
#
# ----------------------------------------------------------------------------
sub mangle
{
    my ($self, $word, $allow_colon, $ignore_stops) = @_;

    # All words are treated as lowercase

    my $lcword = lc($word);

    return '' unless $lcword;

    # Stop words are ignored

    return '' if ( ( ( $self->{stop__}{$lcword} ) ||   # PROFILE BLOCK START
                     ( $self->{stop__}{$word} ) ) &&
                   ( !defined( $ignore_stops ) ) );    # PROFILE BLOCK STOP

    # Remove characters that would mess up a Perl regexp and replace with .

    $lcword =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.|\\)/\./g;

    # Long words are ignored also

    return '' if ( length($lcword) > 45 );

    # Ditch long hex numbers

    return '' if ( $lcword =~ /^[A-F0-9]{8,}$/i );

    # Colons are forbidden inside words, we should never get passed a word
    # with a colon in it here, but if we do then we strip the colon.  The colon
    # is used as a separator between a special identifier and a word, see MailParse.pm
    # for more details

    $lcword =~ s/://g if ( !defined( $allow_colon ) );

    return ($lcword =~ /:/ )?$word:$lcword;
}

# ----------------------------------------------------------------------------
#
# add_stopword, remove_stopword
#
# Adds or removes a stop word
#
# $stopword    The word to add or remove
# $lang        The current language
#
# Returns 1 if successful, or 0 for a bad stop word
# ----------------------------------------------------------------------------

sub add_stopword
{
    my ( $self, $stopword, $lang ) = @_;

    # In Japanese mode, reject non EUC Japanese characters.

    if ( $lang eq 'Nihongo') {
        if ( $stopword !~ /^($euc_jp)+$/o ) {
            return 0;
        }
    } else {
        if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) {
            return 0;
        }
    }

    $stopword = $self->mangle( $stopword, 1, 1 );

    if ( $stopword ne '' ) {
        $self->{stop__}{$stopword} = 1;
        $self->save_stopwords();

       return 1;
    }

    return 0;
}

sub remove_stopword
{
    my ( $self, $stopword, $lang ) = @_;

    # In Japanese mode, reject non EUC Japanese characters.

    if ( $lang eq 'Nihongo') {
        if ( $stopword !~ /^($euc_jp)+$/o ) {
            return 0;
        }
    } else {
        if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) {
            return 0;
        }
    }

    $stopword = $self->mangle( $stopword, 1, 1 );

    if ( $stopword ne '' ) {
        delete $self->{stop__}{$stopword};
        $self->save_stopwords();

        return 1;
    }

    return 0;
}

# GETTER/SETTERS

sub stopwords
{
    my ( $self, $value ) = @_;

    if ( defined( $value ) ) {
        %{$self->{stop__}} = %{$value};
    }

    return keys %{$self->{stop__}};
}

1;