Lucene++ - a full-featured, c++ search engine
API Documentation


Loading...
Searching...
No Matches
StandardTokenizer.h
Go to the documentation of this file.
1
2// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3// Distributable under the terms of either the Apache License (Version 2.0)
4// or the GNU Lesser General Public License.
6
7#ifndef STANDARDTOKENIZER_H
8#define STANDARDTOKENIZER_H
9
10#include "Tokenizer.h"
11
12namespace Lucene {
13
// A grammar-based tokenizer. Derives from Tokenizer, which is a TokenStream whose
// input is a Reader.
// NOTE(review): the leading numbers on each line are the line numbers of the original
// header as rendered in this listing; gaps in the numbering mean some declarations
// and doc comments are not shown here.
34class LPPAPI StandardTokenizer : public Tokenizer {
35public:
39
// Creates a new StandardTokenizer with a given AttributeSource.
41 StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeSourcePtr& source, const ReaderPtr& input);
42
// Creates a new StandardTokenizer with a given AttributeFactory.
44 StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeFactoryPtr& factory, const ReaderPtr& input);
45
47
49
50protected:
53
56
57 // this tokenizer generates three attributes: offset, positionIncrement and type
62
63public:
// Integer token type constants.
// NOTE(review): presumably these pair with the string token types returned by
// TOKEN_TYPES() -- confirm against the full header.
64 static const int32_t ALPHANUM;
65 static const int32_t APOSTROPHE;
66 static const int32_t ACRONYM;
67 static const int32_t COMPANY;
68 static const int32_t EMAIL;
69 static const int32_t HOST;
70 static const int32_t NUM;
71 static const int32_t CJ;
72
74 static const int32_t ACRONYM_DEP;
75
78
79protected:
// NOTE(review): presumably the initialization shared by the constructors -- confirm
// against the implementation file.
80 void init(const ReaderPtr& input, LuceneVersion::Version matchVersion);
81
82public:
// Set the max allowed token length. Any token longer than this is skipped.
84 void setMaxTokenLength(int32_t length);
85
88
90 virtual bool incrementToken();
91
// Called by the consumer after the last token has been consumed.
92 virtual void end();
93
// Reset the tokenizer to a new reader.
94 virtual void reset(const ReaderPtr& input);
95
99
102 void setReplaceInvalidAcronym(bool replaceInvalidAcronym);
103};
104
105}
106
107#endif
#define LUCENE_CLASS(Name)
Definition LuceneObject.h:24
Utility template class to handle collections that can be safely copied and shared.
Definition Collection.h:17
Version
Definition Constants.h:40
A grammar-based tokenizer.
Definition StandardTokenizer.h:34
static const int32_t ACRONYM_DEP
Definition StandardTokenizer.h:74
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() ...
StandardTokenizerImplPtr scanner
A private instance of the scanner.
Definition StandardTokenizer.h:52
StandardTokenizer(LuceneVersion::Version matchVersion, const ReaderPtr &input)
Creates a new instance of the StandardTokenizer. Attaches the input to the newly created scanner.
static const int32_t CJ
Definition StandardTokenizer.h:71
static const int32_t ACRONYM
Definition StandardTokenizer.h:66
static const Collection< String > TOKEN_TYPES()
String token types that correspond to token type int constants.
void setReplaceInvalidAcronym(bool replaceInvalidAcronym)
int32_t maxTokenLength
Definition StandardTokenizer.h:55
void init(const ReaderPtr &input, LuceneVersion::Version matchVersion)
void setMaxTokenLength(int32_t length)
Set the max allowed token length. Any token longer than this is skipped.
bool replaceInvalidAcronym
Definition StandardTokenizer.h:54
static const int32_t COMPANY
Definition StandardTokenizer.h:67
static const int32_t HOST
Definition StandardTokenizer.h:69
StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeFactoryPtr &factory, const ReaderPtr &input)
Creates a new StandardTokenizer with a given AttributeSource.AttributeFactory.
TypeAttributePtr typeAtt
Definition StandardTokenizer.h:61
StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeSourcePtr &source, const ReaderPtr &input)
Creates a new StandardTokenizer with a given AttributeSource.
TermAttributePtr termAtt
Definition StandardTokenizer.h:58
static const int32_t APOSTROPHE
Definition StandardTokenizer.h:65
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will ...
static const int32_t NUM
Definition StandardTokenizer.h:70
PositionIncrementAttributePtr posIncrAtt
Definition StandardTokenizer.h:60
OffsetAttributePtr offsetAtt
Definition StandardTokenizer.h:59
virtual bool incrementToken()
static const int32_t ALPHANUM
Definition StandardTokenizer.h:64
static const int32_t EMAIL
Definition StandardTokenizer.h:68
A Tokenizer is a TokenStream whose input is a Reader.
Definition Tokenizer.h:20
Definition AbstractAllTermDocs.h:12
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition LuceneTypes.h:520
boost::shared_ptr< PositionIncrementAttribute > PositionIncrementAttributePtr
Definition LuceneTypes.h:45
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition LuceneTypes.h:58
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition LuceneTypes.h:40
boost::shared_ptr< StandardTokenizerImpl > StandardTokenizerImplPtr
Definition LuceneTypes.h:53
boost::shared_ptr< Reader > ReaderPtr
Definition LuceneTypes.h:547
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition LuceneTypes.h:519
boost::shared_ptr< TypeAttribute > TypeAttributePtr
Definition LuceneTypes.h:64

clucene.sourceforge.net