-
-
Notifications
You must be signed in to change notification settings - Fork 938
Expand file tree
/
Copy pathByteListHelper.java
More file actions
148 lines (126 loc) · 5.87 KB
/
ByteListHelper.java
File metadata and controls
148 lines (126 loc) · 5.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package org.jruby.util;
import org.jcodings.Encoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jruby.Ruby;
import static org.jruby.util.RubyStringBuilder.inspectIdentifierByteList;
import static org.jruby.util.RubyStringBuilder.str;
/**
* Helpers for working with bytelists.
*/
public class ByteListHelper {
    /**
     * Callback invoked once for every codepoint a walker visits.
     */
    @FunctionalInterface
    public interface CodePoint {
        /**
         * Visit a single codepoint. Returning false lets the caller of this interface stop walking
         * the remaining codepoints.
         *
         * @param index the position the walker reports for the codepoint being visited (see the
         *              individual walker for whether this is a byte offset or a running count).
         * @param codepoint the codepoint being visited.
         * @param encoding the encoding of the ByteList being walked.
         * @return true to continue walking and false to stop.
         */
        boolean call(int index, int codepoint, Encoding encoding);
    }

    // Note: perhaps there is some great way of doing this with streams but I did not want to box index
    // with value or make someone try and deal with that outside of the closure.
    /**
     * Callback used by {@link #split} for each segment; the result of one call is handed to the next.
     *
     * @param <T> type of the value being visited
     * @param <U> type of the result threaded through successive calls
     */
    @FunctionalInterface
    public interface Visit<T, U> {
        /**
         * @param index running index supplied by the walker
         * @param value the value being visited
         * @param module the result returned by the previous call (null on the first call made by split)
         * @return the result to hand to the next call
         */
        U call(int index, T value, U module);
    }

    /**
     * This method assumes the ByteList will be a valid string for the encoding which it is marked as.
     * It also assumes slow path mbc walking.  If you know you have an ASCII ByteList you should do something
     * else.
     *
     * The index handed to each is the running offset relative to the ByteList's begin; it advances by the
     * length StringSupport.encFastMBCLen reports for every character visited.
     *
     * @param bytelist of the mbc-laden bytes
     * @param each the closure which walks the codepoints
     * @return true if it walks the whole bytelist or false if it stops.
     */
    public static boolean eachCodePoint(ByteList bytelist, CodePoint each) {
        byte[] bytes = bytelist.unsafeBytes();
        int len = bytelist.getRealSize();
        Encoding encoding = bytelist.getEncoding();
        int begin = bytelist.begin();
        int end = begin + len;
        int n;

        for (int i = 0; i < len; i += n) {
            // i is relative to begin; the backing array must be addressed with begin added back in.
            int realIndex = begin + i;
            n = StringSupport.encFastMBCLen(bytes, realIndex, end, encoding);
            if (!each.call(i, encoding.mbcToCode(bytes, realIndex, end), encoding)) return false;
        }

        return true;
    }

    /**
     * Walk the codepoints of a ByteList 'while' each continues to return true.  When it stops being true
     * this returns the index of the codepoint which did not satisfy each; on a full walk it returns the
     * length of the ByteList.  A ByteList marked US-ASCII takes a byte-at-a-time fast path; every other
     * encoding is delegated to the multi-byte walker.
     *
     * On the US-ASCII path a byte which is not ASCII raises the error produced by
     * runtime.newEncodingError.
     *
     * @param runtime the runtime used to build the encoding error
     * @param bytelist of the bytes to walk
     * @param offset place in bytes to search past begin
     * @param each the closure which walks the codepoints
     * @return length if all codepoints match.  index (ignoring begin) if not.
     */
    public static int eachCodePointWhile(Ruby runtime, ByteList bytelist, int offset, CodePoint each) {
        Encoding encoding = bytelist.getEncoding();

        if (encoding != USASCIIEncoding.INSTANCE) {
            return eachMBCCodePointWhile(bytelist, offset, each);
        }

        byte[] bytes = bytelist.unsafeBytes();
        int len = bytelist.getRealSize();
        int begin = bytelist.begin();

        // i is always relative to begin so that the index given to each, the index returned on an early
        // stop and the length returned on a full walk all share the same origin.  Only the array access
        // adds begin back in.
        for (int i = offset; i < len; i++) {
            byte c = bytes[begin + i];
            if (!Encoding.isAscii(c)) throw runtime.newEncodingError(str(runtime, "invalid symbol in encoding " + encoding + " :" , inspectIdentifierByteList(runtime, bytelist)));
            if (!each.call(i, c & 0xff, encoding)) return i;
        }

        return len;
    }

    // Should also call through eachCodePointWhile since it will fast path US-ASCII.
    /**
     * Multi-byte walker behind eachCodePointWhile.  The index handed to each is a running count of the
     * characters visited (starting at 0 for the character found at offset), not a byte offset.
     *
     * @param bytelist of the mbc-laden bytes
     * @param offset place in bytes to search past begin
     * @param each the closure which walks the codepoints
     * @return length if all codepoints match.  byte index (ignoring begin) of the first codepoint
     *         which did not satisfy each if not.
     */
    private static int eachMBCCodePointWhile(ByteList bytelist, int offset, CodePoint each) {
        Encoding encoding = bytelist.getEncoding();
        byte[] bytes = bytelist.unsafeBytes();
        int len = bytelist.getRealSize();
        int begin = bytelist.begin();
        int end = begin + len;
        int n;

        for (int i = 0, p = begin + offset; p < end; i++, p += n) {
            n = StringSupport.length(encoding, bytes, p, end);
            if (!each.call(i, encoding.mbcToCode(bytes, p, end), encoding)) {
                // p addresses the backing array; strip begin so the result is comparable with len.
                return p - begin;
            }
        }

        return len;
    }

    /**
     * This method will split a string and call a visitor for each segment between the split pattern.
     * The value returned by one visitor call is passed to the next; if bodyVisitor ever returns null the
     * split stops immediately and null is returned.
     *
     * NOTE(review): after each match the search resumes pattern.getRealSize() bytes later, so an empty
     * pattern presumably would not make progress - confirm callers never pass one.
     *
     * @param value to be split
     * @param pattern the pattern to split value with
     * @param bodyVisitor visitor for all but last segment
     * @param headVisitor visitor for the last segment (if null it will use bodyVisitor).
     * @param <T> Return type of visitor
     * @return last T from headVisitor
     * @deprecated This was only used by Module#const_defined, but was difficult to match MRI's equivalent in this form
     */
    @Deprecated(since = "9.4-")
    public static <T> T split(ByteList value, ByteList pattern, Visit<ByteList, T> bodyVisitor, Visit<ByteList, T> headVisitor) {
        if (headVisitor == null) headVisitor = bodyVisitor;

        Encoding enc = pattern.getEncoding();
        byte[] bytes = value.getUnsafeBytes();
        int begin = value.getBegin();
        int realSize = value.getRealSize();
        int end = begin + realSize;
        int currentOffset = 0;
        int patternIndex;
        int i = 0;
        T current = null;

        for (; currentOffset < realSize && (patternIndex = value.indexOf(pattern, currentOffset)) >= 0; i++) {
            // Re-align the match position onto a character head; if it moves, the match did not start on
            // a character boundary, so skip ahead to the adjusted position and search again.
            int t = enc.rightAdjustCharHead(bytes, currentOffset + begin, patternIndex + begin, end) - begin;
            if (t != patternIndex) {
                currentOffset = t;
                continue;
            }

            current = bodyVisitor.call(i, value.makeShared(currentOffset, patternIndex - currentOffset), current);
            if (current == null) return null;

            currentOffset = patternIndex + pattern.getRealSize();
        }

        // Whatever remains after the final match (possibly empty) is the last segment.
        return headVisitor.call(i, value.makeShared(currentOffset, realSize - currentOffset), current);
    }
}