Bitcoin ABC
0.26.3
P2P Digital Currency
Loading...
Searching...
No Matches
src
univalue
include
univalue_utffilter.h
Go to the documentation of this file.
1
// Copyright 2016 Wladimir J. van der Laan
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or https://opensource.org/licenses/mit-license.php.
4
#ifndef BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
5
#define BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
6
7
#include <cstdint>
#include <string>
8
13
/**
 * Filter that generates and validates UTF-8, and collates UTF-16 surrogate
 * pairs (as produced by JSON "\uXXXX" escapes, see RFC 4627) into a single
 * code point.
 *
 * Input is fed either byte-by-byte through push_back() or as whole code
 * points through push_back_u(). Accepted output is appended, UTF-8 encoded,
 * to the std::string passed to the constructor. Any error latches is_valid
 * to false; the verdict is read through finalize().
 *
 * The filter keeps a reference to the output string, so that string must
 * outlive the filter.
 *
 * NOTE(review): the byte decoder does not reject overlong encodings
 * (e.g. 0xC0 0x80) nor values above U+10FFFF; both are re-encoded as-is.
 * Confirm whether callers rely on that before tightening it.
 */
class JSONUTF8StringFilter {
public:
    /**
     * @param s Output string that receives the filtered UTF-8 bytes.
     */
    explicit JSONUTF8StringFilter(std::string &s)
        : str(s), is_valid(true), codepoint(0), state(0), surpair(0) {}

    /**
     * Write a single 8-bit char (may be part of a UTF-8 sequence).
     *
     * ASCII bytes are appended directly; multi-byte sequences are decoded
     * into a code point which is handed to push_back_u() once complete.
     */
    void push_back(uint8_t ch) {
        if (state == 0) {
            if (ch < 0x80) {
                // 7-bit ASCII, fast direct pass-through
                str.push_back(ch);
            } else if (ch < 0xc0) {
                // Mid-sequence character, invalid in this state
                is_valid = false;
            } else if (ch < 0xe0) {
                // Start of 2-byte sequence
                codepoint = (ch & 0x1f) << 6;
                state = 6;
            } else if (ch < 0xf0) {
                // Start of 3-byte sequence
                codepoint = (ch & 0x0f) << 12;
                state = 12;
            } else if (ch < 0xf8) {
                // Start of 4-byte sequence
                codepoint = (ch & 0x07) << 18;
                state = 18;
            } else {
                // Reserved, invalid
                is_valid = false;
            }
        } else {
            if ((ch & 0xc0) != 0x80) {
                // Not a continuation, invalid
                is_valid = false;
            }
            // The sequence is still consumed after an error so that the
            // decoder returns to the idle state; is_valid stays false.
            state -= 6;
            codepoint |= (ch & 0x3f) << state;
            if (state == 0) {
                push_back_u(codepoint);
            }
        }
    }

    /**
     * Write a code point directly, possibly collating surrogate pairs.
     *
     * A high surrogate (U+D800..U+DBFF) is remembered and combined with the
     * low surrogate (U+DC00..U+DFFF) that must follow it. Unpaired or
     * misordered surrogates mark the stream invalid.
     */
    void push_back_u(unsigned int codepoint_) {
        if (state) {
            // Only accept full codepoints in open state
            is_valid = false;
        }
        if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) {
            // First half of surrogate pair
            if (surpair) {
                // Two subsequent surrogate pair openers - fail
                is_valid = false;
            } else {
                surpair = codepoint_;
            }
        } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) {
            // Second half of surrogate pair
            if (surpair) {
                // Open surrogate pair, expect second half.
                // Compute code point from UTF-16 surrogate pair. This must
                // be an addition: the shifted high-surrogate payload spans
                // bits 10..19 and therefore overlaps the 0x10000 offset
                // (bit 16). OR-ing would lose the carry for every code
                // point at or above U+20000.
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) +
                                 (codepoint_ - 0xDC00));
                surpair = 0;
            } else {
                // Second half doesn't follow a first half - fail
                is_valid = false;
            }
        } else {
            if (surpair) {
                // First half of surrogate pair not followed by second
                // - fail
                is_valid = false;
            } else {
                append_codepoint(codepoint_);
            }
        }
    }

    /**
     * Check that we're in a state where the string can be ended:
     * no open sequences, no open surrogate pairs, etc.
     *
     * @return true if everything written so far was valid and complete.
     */
    bool finalize() {
        if (state || surpair) {
            is_valid = false;
        }
        return is_valid;
    }

private:
    // Destination for the filtered UTF-8 output.
    std::string &str;
    // Cleared on the first error and never set again.
    bool is_valid;
    // Current UTF-8 decoding state
    unsigned int codepoint;
    // Top bit to be filled in for next UTF-8 byte, or 0
    int state;

    // Keep track of the following state to handle the following section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    // First half of open UTF-16 surrogate pair, or 0
    unsigned int surpair;

    /**
     * Append the UTF-8 encoding of codepoint_ to the output string.
     *
     * Values up to 0x1FFFFF are encoded in 1 to 4 bytes. Larger values
     * append nothing and do not touch is_valid.
     */
    void append_codepoint(unsigned int codepoint_) {
        if (codepoint_ <= 0x7f) {
            str.push_back(static_cast<char>(codepoint_));
        } else if (codepoint_ <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (codepoint_ >> 6)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (codepoint_ >> 12)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0x1FFFFF) {
            str.push_back(static_cast<char>(0xF0 | (codepoint_ >> 18)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        }
    }
};
114
115
#endif
// BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
JSONUTF8StringFilter
Filter that generates and validates UTF-8, as well as collates UTF-16 surrogate pairs as specified in RFC 4627.
Definition
univalue_utffilter.h:13
JSONUTF8StringFilter::surpair
unsigned int surpair
Definition
univalue_utffilter.h:94
JSONUTF8StringFilter::is_valid
bool is_valid
Definition
univalue_utffilter.h:79
JSONUTF8StringFilter::codepoint
unsigned int codepoint
Definition
univalue_utffilter.h:81
JSONUTF8StringFilter::finalize
bool finalize()
Definition
univalue_utffilter.h:72
JSONUTF8StringFilter::state
int state
Definition
univalue_utffilter.h:82
JSONUTF8StringFilter::push_back_u
void push_back_u(unsigned int codepoint_)
Definition
univalue_utffilter.h:44
JSONUTF8StringFilter::append_codepoint
void append_codepoint(unsigned int codepoint_)
Definition
univalue_utffilter.h:96
JSONUTF8StringFilter::push_back
void push_back(uint8_t ch)
Definition
univalue_utffilter.h:18
JSONUTF8StringFilter::str
std::string & str
Definition
univalue_utffilter.h:78
JSONUTF8StringFilter::JSONUTF8StringFilter
JSONUTF8StringFilter(std::string &s)
Definition
univalue_utffilter.h:15
GetRand
T GetRand(T nMax=std::numeric_limits< T >::max()) noexcept
Generate a uniform random integer of type T in the range [0..nMax). nMax defaults to std::numeric_limits<T>::max().
Definition
random.h:85
Generated on Fri Dec 27 2024 02:38:19 for Bitcoin ABC by
1.9.8