View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.jexl3.parser;
18  
19  import java.util.Objects;
20  
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits and
 * the backslash character - '\' - itself. When escapes are interpreted (strings and
 * regular expressions, not templates), the control escapes 'b', 't', 'n', 'f' and 'r'
 * are also translated to their control character.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;

    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;

    /** The base 10 offset used to convert hexadecimal letters to their numeric value. */
    private static final int BASE10 = 10;

    /** The last 7bits ASCII character. */
    private static final char LAST_ASCII = 127;

    /** The first printable 7bits ASCII character. */
    private static final char FIRST_ASCII = 32;

    /**
     * Builds a regex pattern string, handles escaping '/' through '\/' syntax.
     * <p>
     * The first character of the sequence is skipped; the second one is used as the
     * separator and is removed from both ends of the result.
     * </p>
     * @param str the string to build from
     * @return the built string
     */
    public static String buildRegex(final CharSequence str) {
        return buildString(str.subSequence(1, str.length()), true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered;
     *               when true, the first and last characters are not copied to the result
     * @param esc whether escape characters are interpreted or escaped
     * @return the built string
     */
    private static String buildString(final CharSequence str, final boolean eatsep, final boolean esc) {
        final StringBuilder strb = new StringBuilder(str.length());
        // a separator of 0 means 'no separator'
        final char sep = eatsep ? str.charAt(0) : 0;
        final int end = str.length() - (eatsep ? 1 : 0);
        final int begin = eatsep ? 1 : 0;
        read(strb, str, begin, end, sep, esc);
        return strb.toString();
    }

    /**
     * Builds a template, does not interpret control escapes (they are copied as is).
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildTemplate(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, false);
    }

    /**
     * Adds an escape char ('\') where needed in the string form of an identifier.
     * @param str the identifier un-escaped string
     * @return the string with a backslash character added before each space, quote, double-quote
     * and backslash; the argument itself when nothing needs escaping (including null)
     */
    public static String escapeIdentifier(final String str) {
        StringBuilder strb = null;
        if (str != null) {
            final int last = str.length();
            for (int n = 0; n < last; ++n) {
                final char c = str.charAt(n);
                switch (c) {
                    case ' ':
                    case '\'':
                    case '"':
                    case '\\':
                        // lazily create the buffer on the first character that needs escaping
                        if (strb == null) {
                            strb = new StringBuilder(last);
                            strb.append(str, 0, n);
                        }
                        strb.append('\\');
                        strb.append(c);
                        break;
                    default:
                        if (strb != null) {
                            strb.append(c);
                        }
                        break;
                }
            }
        }
        return Objects.toString(strb, str);
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * NUL characters are dropped; backspace, tab, line feed, form feed and carriage return are
     * written as their two-character escapes; any other character outside of the 32 to 127 range
     * is written as a backslash, 'u' and 4 hexadecimal digits.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character (if 0, no delimiter is added)
     * @return the escaped representation, null if str is null
     */
    public static String escapeString(final CharSequence str, final char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        final StringBuilder strb = new StringBuilder(length + 2);
        if (delim > 0) {
            strb.append(delim);
        }
        for (int i = 0; i < length; ++i) {
            final char c = str.charAt(i);
            switch (c) {
                case 0:
                    continue;
                case '\b':
                    strb.append('\\');
                    strb.append('b');
                    break;
                case '\t':
                    strb.append('\\');
                    strb.append('t');
                    break;
                case '\n':
                    strb.append('\\');
                    strb.append('n');
                    break;
                case '\f':
                    strb.append('\\');
                    strb.append('f');
                    break;
                case '\r':
                    strb.append('\\');
                    strb.append('r');
                    break;
                case '\\':
                    // we escape the backslash only if there is a delimiter
                    if (delim > 0) {
                        strb.append('\\');
                    }
                    strb.append('\\');
                    break;
                default:
                    if (c == delim) {
                        strb.append('\\');
                        strb.append(delim);
                    } else if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeroes to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        final String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
                    break;
            }
        }
        if (delim > 0) {
            strb.append(delim);
        }
        return strb.toString();
    }

    /**
     * Reads the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * Reading stops after copying the first unescaped separator or when reaching the end offset.
     * A separator of 0 means there is no separator: reading only stops at the end offset and
     * both quote characters are considered escapable.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string (0 if none)
     * @param esc whether escape characters are interpreted or escaped
     * @return the last character offset handled in origin
     */
    private static int read(final StringBuilder strb, final CharSequence str, final int begin, final int end, final char sep, final boolean esc) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            final char c = str.charAt(index);
            if (escape) {
                // a Unicode escape needs 4 more characters before the end offset
                if (c == 'u' && index + UCHAR_LEN < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    appendEscaped(strb, c, sep, esc);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            // sep == 0 means 'no separator': a literal NUL character must not end the read
            if (sep != 0 && c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Appends the result of a (non-Unicode) escape sequence to a buffer.
     * @param strb the destination buffer
     * @param c the character that followed the backslash
     * @param sep the separator (0 if none, in which case both quotes are escapable)
     * @param esc whether control escapes (b, t, n, f, r) are interpreted
     */
    private static void appendEscaped(final StringBuilder strb, final char c, final char sep, final boolean esc) {
        final boolean separator = sep == 0 ? c == '\'' || c == '"' : c == sep;
        if (separator || c == '\\') {
            // escapable character: the backslash is consumed
            strb.append(c);
            return;
        }
        if (esc) {
            // https://es5.github.io/x7.html#x7.8.4
            // We don't support vertical tab. If needed, the Unicode escape (U+000B) should be used instead
            switch (c) {
                case 'b': // backspace U+0008
                    strb.append('\b');
                    return;
                case 't': // horizontal tab U+0009
                    strb.append('\t');
                    return;
                case 'n': // line feed U+000A
                    strb.append('\n');
                    return;
                case 'f': // form feed U+000C
                    strb.append('\f');
                    return;
                case 'r': // carriage return U+000D
                    strb.append('\r');
                    return;
                default:
                    break;
            }
        }
        // c is not an escapable character, re-emit the backslash before it
        strb.append('\\').append(c);
    }

    /**
     * Reads the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin
     */
    public static int readString(final StringBuilder strb, final CharSequence str, final int index, final char sep) {
        return read(strb, str, index, str.length(), sep, true);
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits
     * (0-9, a-f, A-F).
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the '\\u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(final StringBuilder strb, final CharSequence str, final int begin) {
        int code = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            final char c = str.charAt(begin + offset);
            final int value;
            if (c >= '0' && c <= '9') {
                value = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                value = c - 'a' + BASE10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + BASE10;
            } else {
                // not an hexadecimal digit: this is not a Unicode escape
                return 0;
            }
            code |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append((char) code);
        return UCHAR_LEN;
    }

    /**
     * Removes the escape char ('\') from an identifier.
     * <p>
     * Each backslash escapes the character that follows it: the backslash is removed and the
     * following character is copied as is, so that an escaped backslash yields one backslash.
     * A trailing lone backslash is dropped.
     * </p>
     * @param str the identifier escaped string, ie with a backslash before space, quote, double-quote and backslash
     * @return the un-escaped string; the argument itself when it contains no backslash (including null)
     */
    public static String unescapeIdentifier(final String str) {
        StringBuilder strb = null;
        if (str != null) {
            final int last = str.length();
            boolean escaped = false;
            for (int n = 0; n < last; ++n) {
                final char c = str.charAt(n);
                if (c == '\\' && !escaped) {
                    // lazily create the buffer on the first escape character
                    if (strb == null) {
                        strb = new StringBuilder(last);
                        strb.append(str, 0, n);
                    }
                    escaped = true;
                } else {
                    if (strb != null) {
                        strb.append(c);
                    }
                    escaped = false;
                }
            }
        }
        return Objects.toString(strb, str);
    }

    /** Default constructor.  */
    protected StringParser() {
        // nothing to initialize
    }
}
341 }