/usr/include/CLucene/analysis/cjk/CJKAnalyzer.h is in libclucene-dev 2.3.3.4-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_cjk_cjkanalyzer_
#define _lucene_analysis_cjk_cjkanalyzer_
#include "CLucene/analysis/AnalysisHeader.h"
CL_NS_DEF2(analysis,cjk)
/**
* CJKTokenizer was modified from StopTokenizer, which does a decent job for
* most European languages. It tokenizes double-byte characters differently:
* a token is returned for every two characters, with overlapping matches.<br>
* Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4".
* Zero-length tokens ("") also need to be filtered out.<br>
* For digits: a digit, '+' and '#' are tokenized as letters.<br>
* For more info on Asian language (Chinese, Japanese, Korean) text segmentation,
* please search <a
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
*
* @author Che, Dong
*/
class CLUCENE_CONTRIBS_EXPORT CJKTokenizer: public CL_NS(analysis)::Tokenizer {
private:
    /** Word offset: indicates which character of the input is being parsed. */
    int32_t offset;

    /** Index used only for walking ioBuffer. */
    int32_t bufferIndex;

    /** Data length. */
    int32_t dataLen;

    /**
     * Character buffer holding the characters that compose the returned
     * Token. Its capacity is fixed at LUCENE_MAX_WORD_LEN.
     */
    TCHAR buffer[LUCENE_MAX_WORD_LEN];

    /**
     * I/O buffer used to store content of the input (the input being one of
     * the members of Tokenizer). Held as a pointer to const characters.
     */
    const TCHAR* ioBuffer;

    /** Word type: single => ASCII, double => non-ASCII, word => default. */
    const TCHAR* tokenType;
    static const TCHAR* tokenTypeSingle;
    static const TCHAR* tokenTypeDouble;

    /**
     * Tag: the previous character is a cached double-byte character.
     * Progression for the input "C1C2C3C4":
     *   emit C1C2, remaining "C2C3C4"   (C1 marked as tokened)
     *   emit C2C3, remaining "C3C4"     (C2 marked as tokened)
     *   emit C3C4                       (C3 marked as tokened)
     * giving "C1C2 C2C3 C3C4".
     */
    bool preIsTokened;

    /**
     * Flag exposed through getIgnoreSurrogates()/setIgnoreSurrogates().
     * NOTE(review): presumably controls whether surrogate code units are
     * skipped while tokenizing -- confirm against the implementation file.
     */
    bool ignoreSurrogates;

public:
    /**
     * Construct a token stream processing the given input.
     *
     * @param in I/O reader
     */
    CJKTokenizer(CL_NS(util)::Reader* in);

    /**
     * Returns the next token in the stream, or null at EOS.
     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
     * for detail.
     *
     * @param token Token object passed in by the caller.
     *        NOTE(review): presumably filled in and returned -- confirm
     *        against the implementation file.
     * @return Token
     *
     * NOTE(review): the documentation inherited from the Java original
     * mentions an IOException when a read error happens in the input
     * stream; confirm which exception, if any, the C++ implementation raises.
     */
    CL_NS(analysis)::Token* next(CL_NS(analysis)::Token* token);

    /** @return the current value of the ignoreSurrogates flag. */
    bool getIgnoreSurrogates() const { return ignoreSurrogates; }

    /**
     * Sets the ignoreSurrogates flag.
     *
     * @param ignoreSurrogates new value for the flag
     */
    void setIgnoreSurrogates(bool ignoreSurrogates){ this->ignoreSurrogates = ignoreSurrogates; }
};
CL_NS_END2
#endif
|