001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.util.ArrayList; 020import java.util.Collections; 021import java.util.List; 022import java.util.ListIterator; 023import java.util.NoSuchElementException; 024 025import org.apache.commons.lang3.ArrayUtils; 026import org.apache.commons.lang3.StringUtils; 027 028/** 029 * Tokenizes a string based on delimiters (separators) 030 * and supporting quoting and ignored character concepts. 031 * <p> 032 * This class can split a String into many smaller strings. It aims 033 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 034 * however it offers much more control and flexibility including implementing 035 * the {@code ListIterator} interface. By default, it is set up 036 * like {@code StringTokenizer}. 037 * <p> 038 * The input String is split into a number of <em>tokens</em>. 039 * Each token is separated from the next String by a <em>delimiter</em>. 040 * One or more delimiter characters must be specified. 041 * <p> 042 * Each token may be surrounded by quotes. 043 * The <em>quote</em> matcher specifies the quote character(s). 044 * A quote may be escaped within a quoted section by duplicating itself. 045 * <p> 046 * Between each token and the delimiter are potentially characters that need trimming. 047 * The <em>trimmer</em> matcher specifies these characters. 048 * One usage might be to trim whitespace characters. 049 * <p> 050 * At any point outside the quotes there might potentially be invalid characters. 051 * The <em>ignored</em> matcher specifies these characters to be removed. 052 * One usage might be to remove new line characters. 053 * <p> 054 * Empty tokens may be removed or returned as null. 055 * <pre> 056 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 057 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 058 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 059 * </pre> 060 * 061 * <table> 062 * <caption>StrTokenizer properties and options</caption> 063 * <tr> 064 * <th>Property</th><th>Type</th><th>Default</th> 065 * </tr> 066 * <tr> 067 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 068 * </tr> 069 * <tr> 070 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 071 * </tr> 072 * <tr> 073 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 074 * </tr> 075 * <tr> 076 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 077 * </tr> 078 * <tr> 079 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 080 * </tr> 081 * </table> 082 * 083 * @since 1.0 084 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0. 085 */ 086@Deprecated 087public class StrTokenizer implements ListIterator<String>, Cloneable { 088 089 /** Comma separated values tokenizer internal variable. */ 090 // @formatter:off 091 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 092 .setDelimiterMatcher(StrMatcher.commaMatcher()) 093 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 094 .setIgnoredMatcher(StrMatcher.noneMatcher()) 095 .setTrimmerMatcher(StrMatcher.trimMatcher()) 096 .setEmptyTokenAsNull(false) 097 .setIgnoreEmptyTokens(false); 098 // @formatter:on 099 100 /** Tab separated values tokenizer internal variable. */ 101 // @formatter:off 102 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 103 .setDelimiterMatcher(StrMatcher.tabMatcher()) 104 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 105 .setIgnoredMatcher(StrMatcher.noneMatcher()) 106 .setTrimmerMatcher(StrMatcher.trimMatcher()) 107 .setEmptyTokenAsNull(false) 108 .setIgnoreEmptyTokens(false); 109 // @formatter:on 110 111 /** 112 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 113 * 114 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 115 */ 116 private static StrTokenizer getCSVClone() { 117 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 118 } 119 120 /** 121 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be 122 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 123 * <p> 124 * You must call a "reset" method to set the string which you want to parse. 125 * </p> 126 * 127 * @return a new tokenizer instance which parses Comma Separated Value strings. 128 */ 129 public static StrTokenizer getCSVInstance() { 130 return getCSVClone(); 131 } 132 133 /** 134 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be 135 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 136 * 137 * @param input the text to parse. 138 * @return a new tokenizer instance which parses Comma Separated Value strings. 139 */ 140 public static StrTokenizer getCSVInstance(final char[] input) { 141 final StrTokenizer tok = getCSVClone(); 142 tok.reset(input); 143 return tok; 144 } 145 146 /** 147 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be 148 * trim whitespace from both ends (which can be overridden with the setTrimmer method). 149 * 150 * @param input the text to parse. 151 * @return a new tokenizer instance which parses Comma Separated Value strings. 152 */ 153 public static StrTokenizer getCSVInstance(final String input) { 154 final StrTokenizer tok = getCSVClone(); 155 tok.reset(input); 156 return tok; 157 } 158 159 /** 160 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 161 * 162 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 163 */ 164 private static StrTokenizer getTSVClone() { 165 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 166 } 167 168 /** 169 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can 170 * be overridden with the setTrimmer method). 171 * <p> 172 * You must call a "reset" method to set the string which you want to parse. 173 * </p> 174 * 175 * @return a new tokenizer instance which parses Tab Separated Value strings. 176 */ 177 public static StrTokenizer getTSVInstance() { 178 return getTSVClone(); 179 } 180 181 /** 182 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can 183 * be overridden with the setTrimmer method). 184 * 185 * @param input the string to parse. 186 * @return a new tokenizer instance which parses Tab Separated Value strings. 187 */ 188 public static StrTokenizer getTSVInstance(final char[] input) { 189 final StrTokenizer tok = getTSVClone(); 190 tok.reset(input); 191 return tok; 192 } 193 194 /** 195 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can 196 * be overridden with the setTrimmer method). 197 * 198 * @param input the string to parse. 199 * @return a new tokenizer instance which parses Tab Separated Value strings. 200 */ 201 public static StrTokenizer getTSVInstance(final String input) { 202 final StrTokenizer tok = getTSVClone(); 203 tok.reset(input); 204 return tok; 205 } 206 207 /** The text to work on. */ 208 private char[] chars; 209 210 /** The parsed tokens. */ 211 private String[] tokens; 212 213 /** The current iteration position. */ 214 private int tokenPos; 215 216 /** The delimiter matcher. */ 217 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 218 219 /** The quote matcher. */ 220 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 221 222 /** The ignored matcher. */ 223 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 224 225 /** The trimmer matcher. */ 226 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 227 228 /** Whether to return empty tokens as null. */ 229 private boolean emptyAsNull; 230 231 /** Whether to ignore empty tokens. */ 232 private boolean ignoreEmptyTokens = true; 233 234 /** 235 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize. 236 * <p> 237 * This constructor is normally used with {@link #reset(String)}. 238 * </p> 239 */ 240 public StrTokenizer() { 241 this.chars = null; 242 } 243 244 /** 245 * Constructs a tokenizer splitting on space, tab, newline and form feed 246 * as per StringTokenizer. 247 * 248 * @param input the string which is to be parsed, not cloned. 249 */ 250 public StrTokenizer(final char[] input) { 251 if (input == null) { 252 this.chars = null; 253 } else { 254 this.chars = input.clone(); 255 } 256 } 257 258 /** 259 * Constructs a tokenizer splitting on the specified character. 260 * 261 * @param input the string which is to be parsed, not cloned. 262 * @param delim the field delimiter character. 263 */ 264 public StrTokenizer(final char[] input, final char delim) { 265 this(input); 266 setDelimiterChar(delim); 267 } 268 269 /** 270 * Constructs a tokenizer splitting on the specified delimiter character 271 * and handling quotes using the specified quote character. 272 * 273 * @param input the string which is to be parsed, not cloned. 274 * @param delim the field delimiter character. 275 * @param quote the field quoted string character. 276 */ 277 public StrTokenizer(final char[] input, final char delim, final char quote) { 278 this(input, delim); 279 setQuoteChar(quote); 280 } 281 282 /** 283 * Constructs a tokenizer splitting on the specified string. 284 * 285 * @param input the string which is to be parsed, not cloned. 286 * @param delim the field delimiter string. 287 */ 288 public StrTokenizer(final char[] input, final String delim) { 289 this(input); 290 setDelimiterString(delim); 291 } 292 293 /** 294 * Constructs a tokenizer splitting using the specified delimiter matcher. 295 * 296 * @param input the string which is to be parsed, not cloned. 297 * @param delim the field delimiter matcher. 298 */ 299 public StrTokenizer(final char[] input, final StrMatcher delim) { 300 this(input); 301 setDelimiterMatcher(delim); 302 } 303 304 /** 305 * Constructs a tokenizer splitting using the specified delimiter matcher 306 * and handling quotes using the specified quote matcher. 307 * 308 * @param input the string which is to be parsed, not cloned. 309 * @param delim the field delimiter character. 310 * @param quote the field quoted string character. 311 */ 312 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 313 this(input, delim); 314 setQuoteMatcher(quote); 315 } 316 317 /** 318 * Constructs a tokenizer splitting on space, tab, newline and form feed 319 * as per StringTokenizer. 320 * 321 * @param input the string which is to be parsed. 322 */ 323 public StrTokenizer(final String input) { 324 if (input != null) { 325 chars = input.toCharArray(); 326 } else { 327 chars = null; 328 } 329 } 330 331 /** 332 * Constructs a tokenizer splitting on the specified delimiter character. 333 * 334 * @param input the string which is to be parsed. 335 * @param delim the field delimiter character. 336 */ 337 public StrTokenizer(final String input, final char delim) { 338 this(input); 339 setDelimiterChar(delim); 340 } 341 342 /** 343 * Constructs a tokenizer splitting on the specified delimiter character 344 * and handling quotes using the specified quote character. 345 * 346 * @param input the string which is to be parsed. 347 * @param delim the field delimiter character. 348 * @param quote the field quoted string character. 349 */ 350 public StrTokenizer(final String input, final char delim, final char quote) { 351 this(input, delim); 352 setQuoteChar(quote); 353 } 354 355 /** 356 * Constructs a tokenizer splitting on the specified delimiter string. 357 * 358 * @param input the string which is to be parsed. 359 * @param delim the field delimiter string. 360 */ 361 public StrTokenizer(final String input, final String delim) { 362 this(input); 363 setDelimiterString(delim); 364 } 365 366 /** 367 * Constructs a tokenizer splitting using the specified delimiter matcher. 368 * 369 * @param input the string which is to be parsed. 370 * @param delim the field delimiter matcher. 371 */ 372 public StrTokenizer(final String input, final StrMatcher delim) { 373 this(input); 374 setDelimiterMatcher(delim); 375 } 376 377 /** 378 * Constructs a tokenizer splitting using the specified delimiter matcher 379 * and handling quotes using the specified quote matcher. 380 * 381 * @param input the string which is to be parsed. 382 * @param delim the field delimiter matcher. 383 * @param quote the field quoted string matcher. 384 */ 385 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 386 this(input, delim); 387 setQuoteMatcher(quote); 388 } 389 390 /** 391 * Unsupported ListIterator operation. 392 * 393 * @param obj this parameter ignored. 394 * @throws UnsupportedOperationException always. 395 */ 396 @Override 397 public void add(final String obj) { 398 throw new UnsupportedOperationException("add() is unsupported"); 399 } 400 401 /** 402 * Adds a token to a list, paying attention to the parameters we've set. 403 * 404 * @param list the list to add to. 405 * @param tok the token to add. 406 */ 407 private void addToken(final List<String> list, String tok) { 408 if (tok == null || tok.isEmpty()) { 409 if (isIgnoreEmptyTokens()) { 410 return; 411 } 412 if (isEmptyTokenAsNull()) { 413 tok = null; 414 } 415 } 416 list.add(tok); 417 } 418 419 /** 420 * Checks if tokenization has been done, and if not then do it. 421 */ 422 private void checkTokenized() { 423 if (tokens == null) { 424 if (chars == null) { 425 // still call tokenize as subclass may do some work 426 final List<String> split = tokenize(null, 0, 0); 427 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 428 } else { 429 final List<String> split = tokenize(chars, 0, chars.length); 430 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 431 } 432 } 433 } 434 435 /** 436 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a 437 * {@link CloneNotSupportedException} is caught, return {@code null}. 438 * 439 * @return a new instance of this Tokenizer which has been reset. 440 */ 441 @Override 442 public Object clone() { 443 try { 444 return cloneReset(); 445 } catch (final CloneNotSupportedException ex) { 446 return null; 447 } 448 } 449 450 /** 451 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. 452 * 453 * @return a new instance of this Tokenizer which has been reset. 454 * @throws CloneNotSupportedException if there is a problem cloning. 455 */ 456 Object cloneReset() throws CloneNotSupportedException { 457 // this method exists to enable 100% test coverage 458 final StrTokenizer cloned = (StrTokenizer) super.clone(); 459 if (cloned.chars != null) { 460 cloned.chars = cloned.chars.clone(); 461 } 462 cloned.reset(); 463 return cloned; 464 } 465 466 /** 467 * Gets the String content that the tokenizer is parsing. 468 * 469 * @return The string content being parsed. 470 */ 471 public String getContent() { 472 if (chars == null) { 473 return null; 474 } 475 return new String(chars); 476 } 477 478 /** 479 * Gets the field delimiter matcher. 480 * 481 * @return The delimiter matcher in use. 482 */ 483 public StrMatcher getDelimiterMatcher() { 484 return this.delimMatcher; 485 } 486 487 /** 488 * Gets the ignored character matcher. 489 * <p> 490 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything. 491 * </p> 492 * 493 * @return The ignored matcher in use. 494 */ 495 public StrMatcher getIgnoredMatcher() { 496 return ignoredMatcher; 497 } 498 499 /** 500 * Gets the quote matcher currently in use. 501 * <p> 502 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote). 503 * </p> 504 * 505 * @return The quote matcher in use. 506 */ 507 public StrMatcher getQuoteMatcher() { 508 return quoteMatcher; 509 } 510 511 /** 512 * Gets a copy of the full token list as an independent modifiable array. 513 * 514 * @return The tokens as a String array. 515 */ 516 public String[] getTokenArray() { 517 checkTokenized(); 518 return tokens.clone(); 519 } 520 521 /** 522 * Gets a copy of the full token list as an independent modifiable list. 523 * 524 * @return The tokens as a String array. 525 */ 526 public List<String> getTokenList() { 527 checkTokenized(); 528 final List<String> list = new ArrayList<>(tokens.length); 529 Collections.addAll(list, tokens); 530 531 return list; 532 } 533 534 /** 535 * Gets the trimmer character matcher. 536 * <p> 537 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything. 538 * </p> 539 * 540 * @return The trimmer matcher in use. 541 */ 542 public StrMatcher getTrimmerMatcher() { 543 return trimmerMatcher; 544 } 545 546 /** 547 * Checks whether there are any more tokens. 548 * 549 * @return true if there are more tokens. 550 */ 551 @Override 552 public boolean hasNext() { 553 checkTokenized(); 554 return tokenPos < tokens.length; 555 } 556 557 /** 558 * Checks whether there are any previous tokens that can be iterated to. 559 * 560 * @return true if there are previous tokens. 561 */ 562 @Override 563 public boolean hasPrevious() { 564 checkTokenized(); 565 return tokenPos > 0; 566 } 567 568 /** 569 * Gets whether the tokenizer currently returns empty tokens as null. 570 * The default for this property is false. 571 * 572 * @return true if empty tokens are returned as null. 573 */ 574 public boolean isEmptyTokenAsNull() { 575 return this.emptyAsNull; 576 } 577 578 /** 579 * Gets whether the tokenizer currently ignores empty tokens. 580 * The default for this property is true. 581 * 582 * @return true if empty tokens are not returned. 583 */ 584 public boolean isIgnoreEmptyTokens() { 585 return ignoreEmptyTokens; 586 } 587 588 /** 589 * Checks if the characters at the index specified match the quote 590 * already matched in readNextToken(). 591 * 592 * @param srcChars the character array being tokenized. 593 * @param pos the position to check for a quote. 594 * @param len the length of the character array being tokenized. 595 * @param quoteStart the start position of the matched quote, 0 if no quoting. 596 * @param quoteLen the length of the matched quote, 0 if no quoting. 597 * @return true if a quote is matched. 598 */ 599 private boolean isQuote(final char[] srcChars, 600 final int pos, 601 final int len, 602 final int quoteStart, 603 final int quoteLen) { 604 for (int i = 0; i < quoteLen; i++) { 605 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 606 return false; 607 } 608 } 609 return true; 610 } 611 612 /** 613 * Gets the next token. 614 * 615 * @return The next String token. 616 * @throws NoSuchElementException if there are no more elements. 617 */ 618 @Override 619 public String next() { 620 if (hasNext()) { 621 return tokens[tokenPos++]; 622 } 623 throw new NoSuchElementException(); 624 } 625 626 /** 627 * Gets the index of the next token to return. 628 * 629 * @return The next token index. 630 */ 631 @Override 632 public int nextIndex() { 633 return tokenPos; 634 } 635 636 /** 637 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no 638 * tokens remain. 639 * 640 * @return The next sequential token, or null when no more tokens are found. 641 */ 642 public String nextToken() { 643 if (hasNext()) { 644 return tokens[tokenPos++]; 645 } 646 return null; 647 } 648 649 /** 650 * Gets the token previous to the last returned token. 651 * 652 * @return The previous token. 653 */ 654 @Override 655 public String previous() { 656 if (hasPrevious()) { 657 return tokens[--tokenPos]; 658 } 659 throw new NoSuchElementException(); 660 } 661 662 /** 663 * Gets the index of the previous token. 664 * 665 * @return The previous token index. 666 */ 667 @Override 668 public int previousIndex() { 669 return tokenPos - 1; 670 } 671 672 /** 673 * Gets the previous token from the String. 674 * 675 * @return The previous sequential token, or null when no more tokens are found. 676 */ 677 public String previousToken() { 678 if (hasPrevious()) { 679 return tokens[--tokenPos]; 680 } 681 return null; 682 } 683 684 /** 685 * Reads character by character through the String to get the next token. 686 * 687 * @param srcChars the character array being tokenized. 688 * @param start the first character of field. 689 * @param len the length of the character array being tokenized. 690 * @param workArea a temporary work area. 691 * @param tokenList the list of parsed tokens. 692 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found. 693 */ 694 private int readNextToken(final char[] srcChars, 695 int start, 696 final int len, 697 final StrBuilder workArea, 698 final List<String> tokenList) { 699 // skip all leading whitespace, unless it is the 700 // field delimiter or the quote character 701 while (start < len) { 702 final int removeLen = Math.max( 703 getIgnoredMatcher().isMatch(srcChars, start, start, len), 704 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 705 if (removeLen == 0 706 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 707 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 708 break; 709 } 710 start += removeLen; 711 } 712 713 // handle reaching end 714 if (start >= len) { 715 addToken(tokenList, StringUtils.EMPTY); 716 return -1; 717 } 718 719 // handle empty token 720 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 721 if (delimLen > 0) { 722 addToken(tokenList, StringUtils.EMPTY); 723 return start + delimLen; 724 } 725 726 // handle found token 727 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 728 if (quoteLen > 0) { 729 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 730 } 731 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 732 } 733 734 /** 735 * Reads a possibly quoted string token. 736 * 737 * @param srcChars the character array being tokenized. 738 * @param start the first character of field. 739 * @param len the length of the character array being tokenized. 740 * @param workArea a temporary work area. 741 * @param tokenList the list of parsed tokens. 742 * @param quoteStart the start position of the matched quote, 0 if no quoting. 743 * @param quoteLen the length of the matched quote, 0 if no quoting. 744 * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string. 745 */ 746 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 747 final List<String> tokenList, final int quoteStart, final int quoteLen) { 748 // Loop until we've found the end of the quoted 749 // string or the end of the input 750 workArea.clear(); 751 int pos = start; 752 boolean quoting = quoteLen > 0; 753 int trimStart = 0; 754 755 while (pos < len) { 756 // quoting mode can occur several times throughout a string 757 // we must switch between quoting and non-quoting until we 758 // encounter a non-quoted delimiter, or end of string 759 if (quoting) { 760 // In quoting mode 761 762 // If we've found a quote character, see if it's 763 // followed by a second quote. If so, then we need 764 // to actually put the quote character into the token 765 // rather than end the token. 766 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 767 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 768 // matched pair of quotes, thus an escaped quote 769 workArea.append(srcChars, pos, quoteLen); 770 pos += quoteLen * 2; 771 trimStart = workArea.size(); 772 continue; 773 } 774 775 // end of quoting 776 quoting = false; 777 pos += quoteLen; 778 continue; 779 } 780 781 } else { 782 // Not in quoting mode 783 784 // check for delimiter, and thus end of token 785 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 786 if (delimLen > 0) { 787 // return condition when end of token found 788 addToken(tokenList, workArea.substring(0, trimStart)); 789 return pos + delimLen; 790 } 791 792 // check for quote, and thus back into quoting mode 793 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 794 quoting = true; 795 pos += quoteLen; 796 continue; 797 } 798 799 // check for ignored (outside quotes), and ignore 800 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 801 if (ignoredLen > 0) { 802 pos += ignoredLen; 803 continue; 804 } 805 806 // check for trimmed character 807 // don't yet know if its at the end, so copy to workArea 808 // use trimStart to keep track of trim at the end 809 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 810 if (trimmedLen > 0) { 811 workArea.append(srcChars, pos, trimmedLen); 812 pos += trimmedLen; 813 continue; 814 } 815 816 } 817 // copy regular character from inside quotes 818 workArea.append(srcChars[pos++]); 819 trimStart = workArea.size(); 820 } 821 822 // return condition when end of string found 823 addToken(tokenList, workArea.substring(0, trimStart)); 824 return -1; 825 } 826 827 /** 828 * Unsupported ListIterator operation. 829 * 830 * @throws UnsupportedOperationException always. 831 */ 832 @Override 833 public void remove() { 834 throw new UnsupportedOperationException("remove() is unsupported"); 835 } 836 837 /** 838 * Resets this tokenizer, forgetting all parsing and iteration already completed. 839 * <p> 840 * This method allows the same tokenizer to be reused for the same String. 841 * </p> 842 * 843 * @return {@code this} instance. 844 */ 845 public StrTokenizer reset() { 846 tokenPos = 0; 847 tokens = null; 848 return this; 849 } 850 851 /** 852 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines. 853 * 854 * @param input the new character array to tokenize, not cloned, null sets no text to parse. 855 * @return {@code this} instance. 856 */ 857 public StrTokenizer reset(final char[] input) { 858 reset(); 859 if (input != null) { 860 this.chars = input.clone(); 861 } else { 862 this.chars = null; 863 } 864 return this; 865 } 866 867 /** 868 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines. 869 * 870 * @param input the new string to tokenize, null sets no text to parse. 871 * @return {@code this} instance. 872 */ 873 public StrTokenizer reset(final String input) { 874 reset(); 875 if (input != null) { 876 this.chars = input.toCharArray(); 877 } else { 878 this.chars = null; 879 } 880 return this; 881 } 882 883 /** 884 * Unsupported ListIterator operation. 885 * 886 * @param obj this parameter ignored. 887 * @throws UnsupportedOperationException Always thrown. 888 */ 889 @Override 890 public void set(final String obj) { 891 throw new UnsupportedOperationException("set() is unsupported"); 892 } 893 894 /** 895 * Sets the field delimiter character. 896 * 897 * @param delim the delimiter character to use. 898 * @return {@code this} instance. 899 */ 900 public StrTokenizer setDelimiterChar(final char delim) { 901 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 902 } 903 904 /** 905 * Sets the field delimiter matcher. 906 * <p> 907 * The delimiter is used to separate one token from another. 908 * </p> 909 * 910 * @param delim the delimiter matcher to use. 911 * @return {@code this} instance. 912 */ 913 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 914 if (delim == null) { 915 this.delimMatcher = StrMatcher.noneMatcher(); 916 } else { 917 this.delimMatcher = delim; 918 } 919 return this; 920 } 921 922 /** 923 * Sets the field delimiter string. 924 * 925 * @param delim the delimiter string to use. 926 * @return {@code this} instance. 927 */ 928 public StrTokenizer setDelimiterString(final String delim) { 929 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 930 } 931 932 /** 933 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false. 934 * 935 * @param emptyAsNull whether empty tokens are returned as null. 936 * @return {@code this} instance. 937 */ 938 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 939 this.emptyAsNull = emptyAsNull; 940 return this; 941 } 942 943 /** 944 * Sets the character to ignore. 945 * <p> 946 * This character is ignored when parsing the String, unless it is within a quoted region. 947 * </p> 948 * 949 * @param ignored the ignored character to use. 950 * @return {@code this} instance. 951 */ 952 public StrTokenizer setIgnoredChar(final char ignored) { 953 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 954 } 955 956 /** 957 * Sets the matcher for characters to ignore. 958 * <p> 959 * These characters are ignored when parsing the String, unless they are within a quoted region. 960 * </p> 961 * 962 * @param ignored the ignored matcher to use, null ignored. 963 * @return {@code this} instance. 964 */ 965 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 966 if (ignored != null) { 967 this.ignoredMatcher = ignored; 968 } 969 return this; 970 } 971 972 /** 973 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true. 974 * 975 * @param ignoreEmptyTokens whether empty tokens are not returned. 976 * @return {@code this} instance. 977 */ 978 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 979 this.ignoreEmptyTokens = ignoreEmptyTokens; 980 return this; 981 } 982 983 /** 984 * Sets the quote character to use. 985 * <p> 986 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. 987 * </p> 988 * 989 * @param quote the quote character to use. 990 * @return {@code this} instance. 991 */ 992 public StrTokenizer setQuoteChar(final char quote) { 993 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 994 } 995 996 /** 997 * Sets the quote matcher to use. 998 * <p> 999 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. 1000 * </p> 1001 * 1002 * @param quote the quote matcher to use, null ignored. 1003 * @return {@code this} instance. 1004 */ 1005 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1006 if (quote != null) { 1007 this.quoteMatcher = quote; 1008 } 1009 return this; 1010 } 1011 1012 /** 1013 * Sets the matcher for characters to trim. 1014 * <p> 1015 * These characters are trimmed off on each side of the delimiter until the token or quote is found. 1016 * </p> 1017 * 1018 * @param trimmer the trimmer matcher to use, null ignored 1019 * @return {@code this} instance. 1020 */ 1021 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1022 if (trimmer != null) { 1023 this.trimmerMatcher = trimmer; 1024 } 1025 return this; 1026 } 1027 1028 /** 1029 * Gets the number of tokens found in the String. 1030 * 1031 * @return The number of matched tokens. 1032 */ 1033 public int size() { 1034 checkTokenized(); 1035 return tokens.length; 1036 } 1037 1038 /** 1039 * Internal method to performs the tokenization. 1040 * <p> 1041 * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required. 1042 * </p> 1043 * <p> 1044 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or 1045 * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results. 1046 * </p> 1047 * <p> 1048 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other 1049 * values, or even an entirely different array. 1050 * </p> 1051 * 1052 * @param srcChars the character array being tokenized, may be null. 1053 * @param offset the start position within the character array, must be valid. 1054 * @param count the number of characters to tokenize, must be valid. 1055 * @return The modifiable list of String tokens, unmodifiable if null array or zero count. 1056 */ 1057 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1058 if (srcChars == null || count == 0) { 1059 return Collections.emptyList(); 1060 } 1061 final StrBuilder buf = new StrBuilder(); 1062 final List<String> tokenList = new ArrayList<>(); 1063 int pos = offset; 1064 1065 // loop around the entire buffer 1066 while (pos >= 0 && pos < count) { 1067 // find next token 1068 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1069 1070 // handle case where end of string is a delimiter 1071 if (pos >= count) { 1072 addToken(tokenList, StringUtils.EMPTY); 1073 } 1074 } 1075 return tokenList; 1076 } 1077 1078 /** 1079 * Gets the String content that the tokenizer is parsing. 1080 * 1081 * @return The string content being parsed. 1082 */ 1083 @Override 1084 public String toString() { 1085 if (tokens == null) { 1086 return "StrTokenizer[not tokenized yet]"; 1087 } 1088 return "StrTokenizer" + getTokenList(); 1089 } 1090 1091}