1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.jexl3.parser;
18
19 import java.util.Objects;
20
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by exactly 4 hexadecimal digits (0-9, a-f, A-F),
 * the backslash character - '\' - itself and, when building strings (not templates),
 * the control character shortcuts 'b', 't', 'n', 'f' and 'r'.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;

    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;

    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;

    /** The last 7bits ASCII character. */
    private static final char LAST_ASCII = 127;

    /** The first printable 7bits ASCII character. */
    private static final char FIRST_ASCII = 32;

    /**
     * Builds a regex pattern string, handles escaping '/' through '\/' syntax.
     * <p>
     * The first character of the input is discarded; the character that follows it is
     * used as the delimiter and the last character of the input is not read.
     * </p>
     * @param str the string to build from
     * @return the built string
     */
    public static String buildRegex(final CharSequence str) {
        return buildString(str.subSequence(1, str.length()), true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When {@code eatsep} is true, the first character is taken as the separator and
     * neither the first nor the last character of the input is copied to the result.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @param esc whether escape characters are interpreted or escaped
     * @return the built string
     */
    private static String buildString(final CharSequence str, final boolean eatsep, final boolean esc) {
        final StringBuilder strb = new StringBuilder(str.length());
        final char sep = eatsep ? str.charAt(0) : 0;
        final int end = str.length() - (eatsep ? 1 : 0);
        final int begin = eatsep ? 1 : 0;
        read(strb, str, begin, end, sep, esc);
        return strb.toString();
    }

    /**
     * Builds a template, does not escape characters.
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildTemplate(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, false);
    }

    /**
     * Adds an escape char ('\') where needed in the string form of an identifier.
     * @param str the identifier un-escaped string
     * @return the string with added backslash character before space, quote, double-quote and backslash;
     * the input itself (possibly null) when nothing needs escaping
     */
    public static String escapeIdentifier(final String str) {
        // the builder is only allocated once a character that needs escaping is met
        StringBuilder strb = null;
        if (str != null) {
            int n = 0;
            final int last = str.length();
            while (n < last) {
                final char c = str.charAt(n);
                switch (c) {
                    case ' ':
                    case '\'':
                    case '"':
                    case '\\': {
                        if (strb == null) {
                            strb = new StringBuilder(last);
                            // copy everything seen so far that did not need escaping
                            strb.append(str, 0, n);
                        }
                        strb.append('\\');
                        strb.append(c);
                        break;
                    }
                    default:
                        if (strb != null) {
                            strb.append(c);
                        }
                }
                n += 1;
            }
        }
        return Objects.toString(strb, str);
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * NUL characters (code 0) in the input are dropped from the output.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character (if 0, no delimiter is added)
     * @return the escaped representation, null if the input is null
     */
    public static String escapeString(final CharSequence str, final char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        final StringBuilder strb = new StringBuilder(length + 2);
        if (delim > 0) {
            strb.append(delim);
        }
        for (int i = 0; i < length; ++i) {
            final char c = str.charAt(i);
            switch (c) {
                case 0:
                    continue;
                case '\b':
                    strb.append('\\');
                    strb.append('b');
                    break;
                case '\t':
                    strb.append('\\');
                    strb.append('t');
                    break;
                case '\n':
                    strb.append('\\');
                    strb.append('n');
                    break;
                case '\f':
                    strb.append('\\');
                    strb.append('f');
                    break;
                case '\r':
                    strb.append('\\');
                    strb.append('r');
                    break;
                case '\\':
                    // we escape the backslash only if there is a delimiter
                    if (delim > 0) {
                        strb.append('\\');
                    }
                    strb.append('\\');
                    break;
                default:
                    if (c == delim) {
                        strb.append('\\');
                        strb.append(delim);
                    } else if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        final String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        if (delim > 0) {
            strb.append(delim);
        }
        return strb.toString();
    }

    /**
     * Reads the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * Reading stops right after the first unescaped separator, which is itself copied
     * into the destination buffer. A lone backslash that is the very last character read
     * is not copied.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string
     * @param esc whether escape characters are interpreted or escaped
     * @return the last character offset handled in origin: the offset of the separator that
     * stopped the read, or the offset at which the loop ended when no separator was met
     */
    private static int read(final StringBuilder strb, final CharSequence str, final int begin, final int end, final char sep, final boolean esc) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            final char c = str.charAt(index);
            if (escape) {
                // the bound check guarantees the 4 digits after 'u' all lie before end
                if (c == 'u' && index + UCHAR_LEN < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    final boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        if (!esc) {
                            strb.append('\\').append(c);
                        } else {
                            switch (c) {
                                // https://es5.github.io/x7.html#x7.8.4
                                case 'b':
                                    strb.append('\b');
                                    break; // backspace U+0008
                                case 't':
                                    strb.append('\t');
                                    break; // horizontal tab U+0009
                                case 'n':
                                    strb.append('\n');
                                    break; // line feed U+000A
                                // We don't support vertical tab. If needed, the unicode escape (U+000B) should be used instead
                                case 'f':
                                    strb.append('\f');
                                    break; // form feed U+000C
                                case 'r':
                                    strb.append('\r');
                                    break; // carriage return U+000D
                                default:
                                    strb.append('\\').append(c);
                            }
                        }
                    } else {
                        // escaped separator/quote or escaped backslash: keep the character only
                        strb.append(c);
                    }
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Reads a string from a given offset till a given separator,
     * handles escaping through '\' syntax and interprets escape characters.
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin
     */
    public static int readString(final StringBuilder strb, final CharSequence str, final int index, final char sep) {
        return read(strb, str, index, str.length(), sep, true);
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the '\\u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(final StringBuilder strb, final CharSequence str, final int begin) {
        char xc = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            final int value = hexValue(str.charAt(begin + offset));
            if (value < 0) {
                return 0;
            }
            // most significant nibble first
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Converts an hexadecimal digit to its numeric value.
     * @param c the character, one of 0-9, a-f or A-F
     * @return the value between 0 and 15, or -1 if the character is not an hexadecimal digit
     */
    private static int hexValue(final char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + BASE10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + BASE10;
        }
        return -1;
    }

    /**
     * Remove escape char ('\') from an identifier.
     * <p>
     * Every backslash is removed, including one that is itself preceded by a backslash;
     * an escaped backslash is thus not restored by this method.
     * </p>
     * @param str the identifier escaped string, ie with a backslash before space, quote, double-quote and backslash
     * @return the string with no '\\' character; the input itself (possibly null) when it contains none
     */
    public static String unescapeIdentifier(final String str) {
        // the builder is only allocated once a backslash is met
        StringBuilder strb = null;
        if (str != null) {
            int n = 0;
            final int last = str.length();
            while (n < last) {
                final char c = str.charAt(n);
                if (c == '\\') {
                    if (strb == null) {
                        strb = new StringBuilder(last);
                        strb.append(str, 0, n);
                    }
                } else if (strb != null) {
                    strb.append(c);
                }
                n += 1;
            }
        }
        return Objects.toString(strb, str);
    }

    /** Default constructor. */
    protected StringParser() {
        // nothing to initialize
    }
}