Lucene++ - a full-featured, C++ search engine
API Documentation


Loading...
Searching...
No Matches
UTF8Stream.h
Go to the documentation of this file.
1
2// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3// Distributable under the terms of either the Apache License (Version 2.0)
4// or the GNU Lesser General Public License.
6
7#ifndef UTF8STREAM_H
8#define UTF8STREAM_H
9
10#include "LuceneObject.h"
11
12namespace Lucene {
13
// Abstract base shared by the converter classes in this header: UTF8Encoder,
// UTF8Decoder and UTF16Decoder each derive from it and declare their own
// readNext(). Only declarations appear here; no member is defined in this
// header, so values and algorithms cannot be read from it.
14class LPPAPI UTF8Base : public LuceneObject {
15public:
16 virtual ~UTF8Base();
   // NOTE(review): the listing numbering jumps from 16 to 18, so one source
   // line appears to be missing from this view -- confirm against the header.
18
19public:
   // Static constants declared without initializers; their values live
   // elsewhere. Judging by the names they are presumably the UTF-16 surrogate
   // range bounds, the offsets used to combine a surrogate pair, and the
   // largest code point -- TODO confirm against the definitions.
20 static const uint16_t LEAD_SURROGATE_MIN;
21 static const uint16_t LEAD_SURROGATE_MAX;
22 static const uint16_t TRAIL_SURROGATE_MIN;
23 static const uint16_t TRAIL_SURROGATE_MAX;
24 static const uint16_t LEAD_OFFSET;
25 static const uint32_t SURROGATE_OFFSET;
26 static const uint32_t CODE_POINT_MAX;
27
   // wchar_t-typed constants; presumably the substitute emitted for invalid
   // input and an end-of-input marker -- verify against their use.
28 static const wchar_t UNICODE_REPLACEMENT_CHAR;
29 static const wchar_t UNICODE_TERMINATOR;
30
31protected:
   // Pure virtual, which makes this class abstract. Presumably yields the next
   // input unit from the subclass's source -- verify in the implementations.
32 virtual uint32_t readNext() = 0;
33
   // Non-virtual helpers for subclasses. mask8/mask16 return a narrower
   // integer type than their 32-bit argument; the is* functions return bool.
   // Exact masks and ranges are not visible in this header.
34 uint8_t mask8(uint32_t b);
35 uint16_t mask16(uint32_t c);
36 bool isTrail(uint32_t b);
37 bool isSurrogate(uint32_t cp);
38 bool isLeadSurrogate(uint32_t cp);
39 bool isTrailSurrogate(uint32_t cp);
40 bool isValidCodePoint(uint32_t cp);
   // Takes a code point together with a length; presumably the length of the
   // sequence it was read from -- TODO confirm.
41 bool isOverlongSequence(uint32_t cp, int32_t length);
42};
43
// Converter whose input is wchar_t and whose output is uint8_t, per the member
// signatures below. Derives from UTF8Base and declares its own readNext().
44class UTF8Encoder : public UTF8Base {
45public:
   // Takes two pointers delimiting the wchar_t input; presumably the half-open
   // range [unicodeBegin, unicodeEnd) -- TODO confirm in the implementation.
46 UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd);
47 virtual ~UTF8Encoder();
48
   // NOTE(review): the listing numbering jumps from 48 to 50, so one source
   // line appears to be missing from this view -- confirm against the header.
50
51protected:
   // Input pointers, named like the constructor parameters. How the
   // constructor stores them and whether they advance is not shown here.
52 const wchar_t* unicodeBegin;
53 const wchar_t* unicodeEnd;
54
55public:
   // Writes into the caller-provided buffer `utf8`. Presumably `length` is the
   // buffer capacity and the return value the number of bytes produced --
   // verify, including what is returned at end of input.
56 int32_t encode(uint8_t* utf8, int32_t length);
57
   // Same signature as encode(); by name, one variant per input width.
   // Presumably encode() picks between them based on sizeof(wchar_t) --
   // TODO confirm.
58 int32_t utf16to8(uint8_t* utf8, int32_t length);
59 int32_t utf32to8(uint8_t* utf8, int32_t length);
60
61protected:
   // Declares the override of the pure virtual UTF8Base::readNext().
62 virtual uint32_t readNext();
63
   // Takes an output pointer and a code point and returns a uint8_t*;
   // presumably the write position after the appended bytes -- verify.
64 uint8_t* appendChar(uint8_t* utf8, uint32_t cp);
65};
66
68public:
71
73
74protected:
76
77protected:
78 virtual uint32_t readNext();
79};
80
// Converter whose input is uint8_t and whose output is wchar_t, per the member
// signatures below. Derives from UTF8Base and declares its own readNext().
81class UTF8Decoder : public UTF8Base {
82public:
   // Takes two pointers delimiting the uint8_t input; presumably the half-open
   // range [utf8Begin, utf8End) -- TODO confirm in the implementation.
83 UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End);
84 virtual ~UTF8Decoder();
85
   // NOTE(review): the listing numbering jumps from 85 to 87, so one source
   // line appears to be missing from this view -- confirm against the header.
87
88protected:
   // Input pointers, named like the constructor parameters. How the
   // constructor stores them and whether they advance is not shown here.
89 const uint8_t* utf8Begin;
90 const uint8_t* utf8End;
91
92public:
   // Writes into the caller-provided buffer `unicode`. Presumably `length` is
   // the buffer capacity in wchar_t units and the return value the number of
   // units produced -- verify, including what is returned at end of input.
93 int32_t decode(wchar_t* unicode, int32_t length);
94
   // Same signature as decode(); by name, one variant per output width.
   // Presumably decode() picks between them based on sizeof(wchar_t) --
   // TODO confirm.
95 int32_t utf8to16(wchar_t* unicode, int32_t length);
96 int32_t utf8to32(wchar_t* unicode, int32_t length);
97
98protected:
   // Declares the override of the pure virtual UTF8Base::readNext().
99 virtual uint32_t readNext();
100
   // Maps a 32-bit value to an int32_t; presumably the byte count implied by
   // a lead byte -- verify.
101 int32_t sequenceLength(uint32_t cp);
   // Both take `cp` by non-const reference, so they can modify the caller's
   // value, and report a bool. What they write into `cp` on failure is not
   // visible in this header.
102 bool getSequence(uint32_t& cp, int32_t length);
103 bool isValidNext(uint32_t& cp);
104};
105
107public:
110
112
113protected:
115
116protected:
117 virtual uint32_t readNext();
118};
119
// Converter whose input is uint16_t and whose output is wchar_t, per the
// member signatures below. Derives from UTF8Base and declares its own
// readNext().
120class UTF16Decoder : public UTF8Base {
121public:
   // Takes two pointers delimiting the uint16_t input; presumably the
   // half-open range [utf16Begin, utf16End) -- TODO confirm.
122 UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End);
123 virtual ~UTF16Decoder();
124
   // NOTE(review): the listing numbering jumps from 124 to 126, so one source
   // line appears to be missing from this view -- confirm against the header.
126
127protected:
   // Input pointers, named like the constructor parameters. How the
   // constructor stores them and whether they advance is not shown here.
128 const uint16_t* utf16Begin;
129 const uint16_t* utf16End;
130
131public:
   // Writes into the caller-provided buffer `unicode`. Presumably `length` is
   // the buffer capacity in wchar_t units and the return value the number of
   // units produced -- verify.
132 int32_t decode(wchar_t* unicode, int32_t length);
133
   // Same signature as decode(); by name, one variant per output width.
   // Presumably decode() picks between them based on sizeof(wchar_t) --
   // TODO confirm.
134 int32_t utf16to16(wchar_t* unicode, int32_t length);
135 int32_t utf16to32(wchar_t* unicode, int32_t length);
136
137protected:
   // Declares the override of the pure virtual UTF8Base::readNext().
138 virtual uint32_t readNext();
139};
140
141}
142
143#endif
#define LUCENE_CLASS(Name)
Definition LuceneObject.h:24
Base class for all Lucene classes.
Definition LuceneObject.h:31
Definition UTF8Stream.h:120
int32_t utf16to32(wchar_t *unicode, int32_t length)
int32_t utf16to16(wchar_t *unicode, int32_t length)
virtual uint32_t readNext()
const uint16_t * utf16End
Definition UTF8Stream.h:129
int32_t decode(wchar_t *unicode, int32_t length)
const uint16_t * utf16Begin
Definition UTF8Stream.h:128
UTF16Decoder(const uint16_t *utf16Begin, const uint16_t *utf16End)
Definition UTF8Stream.h:14
bool isLeadSurrogate(uint32_t cp)
bool isTrail(uint32_t b)
uint8_t mask8(uint32_t b)
static const wchar_t UNICODE_TERMINATOR
Definition UTF8Stream.h:29
static const uint16_t LEAD_OFFSET
Definition UTF8Stream.h:24
static const uint16_t TRAIL_SURROGATE_MAX
Definition UTF8Stream.h:23
virtual uint32_t readNext()=0
bool isSurrogate(uint32_t cp)
static const uint16_t LEAD_SURROGATE_MAX
Definition UTF8Stream.h:21
bool isTrailSurrogate(uint32_t cp)
static const uint32_t CODE_POINT_MAX
Definition UTF8Stream.h:26
static const uint16_t LEAD_SURROGATE_MIN
Definition UTF8Stream.h:20
bool isValidCodePoint(uint32_t cp)
bool isOverlongSequence(uint32_t cp, int32_t length)
static const uint32_t SURROGATE_OFFSET
Definition UTF8Stream.h:25
static const wchar_t UNICODE_REPLACEMENT_CHAR
Definition UTF8Stream.h:28
uint16_t mask16(uint32_t c)
virtual ~UTF8Base()
static const uint16_t TRAIL_SURROGATE_MIN
Definition UTF8Stream.h:22
Definition UTF8Stream.h:106
ReaderPtr reader
Definition UTF8Stream.h:114
virtual uint32_t readNext()
UTF8DecoderStream(const ReaderPtr &reader)
Definition UTF8Stream.h:81
bool isValidNext(uint32_t &cp)
int32_t utf8to32(wchar_t *unicode, int32_t length)
int32_t sequenceLength(uint32_t cp)
virtual uint32_t readNext()
int32_t utf8to16(wchar_t *unicode, int32_t length)
UTF8Decoder(const uint8_t *utf8Begin, const uint8_t *utf8End)
virtual ~UTF8Decoder()
bool getSequence(uint32_t &cp, int32_t length)
const uint8_t * utf8Begin
Definition UTF8Stream.h:89
const uint8_t * utf8End
Definition UTF8Stream.h:90
int32_t decode(wchar_t *unicode, int32_t length)
Definition UTF8Stream.h:67
virtual uint32_t readNext()
ReaderPtr reader
Definition UTF8Stream.h:75
UTF8EncoderStream(const ReaderPtr &reader)
Definition UTF8Stream.h:44
const wchar_t * unicodeEnd
Definition UTF8Stream.h:53
int32_t utf32to8(uint8_t *utf8, int32_t length)
UTF8Encoder(const wchar_t *unicodeBegin, const wchar_t *unicodeEnd)
uint8_t * appendChar(uint8_t *utf8, uint32_t cp)
int32_t encode(uint8_t *utf8, int32_t length)
const wchar_t * unicodeBegin
Definition UTF8Stream.h:52
int32_t utf16to8(uint8_t *utf8, int32_t length)
virtual ~UTF8Encoder()
virtual uint32_t readNext()
Definition AbstractAllTermDocs.h:12
boost::shared_ptr< Reader > ReaderPtr
Definition LuceneTypes.h:547

clucene.sourceforge.net