Lucene++ - a full-featured, c++ search engine
API Documentation


Loading...
Searching...
No Matches
CharTokenizer.h
Go to the documentation of this file.
1
2// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3// Distributable under the terms of either the Apache License (Version 2.0)
4// or the GNU Lesser General Public License.
6
7#ifndef CHARTOKENIZER_H
8#define CHARTOKENIZER_H
9
10#include "Tokenizer.h"
11
12namespace Lucene {
13
/// An abstract base class for simple, character-oriented tokenizers.
/// Derives from Tokenizer (a TokenStream whose input is a Reader). The class is
/// abstract because isTokenChar() is pure virtual; concrete tokenizers supply it.
15class LPPAPI CharTokenizer : public Tokenizer {
16public:
   /// Construct a tokenizer over the given reader.
17 CharTokenizer(const ReaderPtr& input);
   /// Construct a tokenizer over the given reader, taking an AttributeSource.
   /// NOTE(review): presumably the attributes are shared with `source` - confirm in the .cpp.
18 CharTokenizer(const AttributeSourcePtr& source, const ReaderPtr& input);
   /// Construct a tokenizer over the given reader, taking an AttributeFactory.
   /// NOTE(review): presumably `factory` creates this stream's attributes - confirm in the .cpp.
19 CharTokenizer(const AttributeFactoryPtr& factory, const ReaderPtr& input);
   /// Virtual destructor, so instances can be destroyed through a base-class pointer.
20 virtual ~CharTokenizer();
21
   // NOTE(review): listing line 22 is absent from this rendering. The page's index references
   // the LUCENE_CLASS(Name) macro (LuceneObject.h), so an invocation of it presumably belongs
   // here - confirm against the header.
23
24protected:
   // Tokenizer state. These members carry no descriptions in this listing.
   // NOTE(review): the names suggest a stream offset, a read cursor into ioBuffer and the
   // count of valid characters in ioBuffer - confirm against the implementation.
25 int32_t offset;
26 int32_t bufferIndex;
27 int32_t dataLen;
28
   // Class-wide constants; only declared here, so their values are not visible in this header.
29 static const int32_t MAX_WORD_LEN;
30 static const int32_t IO_BUFFER_SIZE;
31
   // Character buffer. NOTE(review): presumably sized by IO_BUFFER_SIZE - confirm in the .cpp.
32 CharArray ioBuffer;
   // NOTE(review): listing lines 33-34 are absent from this rendering. The index further down
   // locates `TermAttributePtr termAtt` at CharTokenizer.h:33 and `OffsetAttributePtr offsetAtt`
   // at CharTokenizer.h:34 - confirm against the header.
35
36public:
   /// Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
37 virtual bool incrementToken();
   /// Called by the consumer after the last token has been consumed.
38 virtual void end();
   /// Reset the tokenizer to a new reader.
39 virtual void reset(const ReaderPtr& input);
40
41protected:
   /// Returns true if a character should be included in a token.
   /// Pure virtual: every concrete subclass must provide this predicate.
45 virtual bool isTokenChar(wchar_t c) = 0;
46
   /// Called on each token character to normalize it before it is added to the token.
   /// Virtual but not pure, so subclasses may override it.
49 virtual wchar_t normalize(wchar_t c);
50};
51
52}
53
54#endif
#define LUCENE_CLASS(Name)
Definition LuceneObject.h:24
An abstract base class for simple, character-oriented tokenizers.
Definition CharTokenizer.h:15
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will ...
CharTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input)
CharArray ioBuffer
Definition CharTokenizer.h:32
OffsetAttributePtr offsetAtt
Definition CharTokenizer.h:34
CharTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input)
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() ...
static const int32_t IO_BUFFER_SIZE
Definition CharTokenizer.h:30
virtual bool isTokenChar(wchar_t c)=0
Returns true if a character should be included in a token. This tokenizer generates as tokens adjacen...
TermAttributePtr termAtt
Definition CharTokenizer.h:33
static const int32_t MAX_WORD_LEN
Definition CharTokenizer.h:29
int32_t offset
Definition CharTokenizer.h:25
int32_t dataLen
Definition CharTokenizer.h:27
virtual wchar_t normalize(wchar_t c)
Called on each token character to normalize it before it is added to the token. The default implement...
virtual bool incrementToken()
Consumers (i.e., IndexWriter) use this method to advance the stream to the next token....
CharTokenizer(const ReaderPtr &input)
int32_t bufferIndex
Definition CharTokenizer.h:26
A Tokenizer is a TokenStream whose input is a Reader.
Definition Tokenizer.h:20
Definition AbstractAllTermDocs.h:12
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition LuceneTypes.h:520
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition LuceneTypes.h:58
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition LuceneTypes.h:40
boost::shared_ptr< Reader > ReaderPtr
Definition LuceneTypes.h:547
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition LuceneTypes.h:519

clucene.sourceforge.net