001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.lang3.text; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.List; 023import java.util.ListIterator; 024import java.util.NoSuchElementException; 025import java.util.StringTokenizer; 026 027import org.apache.commons.lang3.ArrayUtils; 028import org.apache.commons.lang3.StringUtils; 029 030/** 031 * Tokenizes a string based on delimiters (separators) 032 * and supporting quoting and ignored character concepts. 033 * <p> 034 * This class can split a String into many smaller strings. It aims 035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 036 * however it offers much more control and flexibility including implementing 037 * the {@link ListIterator} interface. By default, it is set up 038 * like {@link StringTokenizer}. 039 * </p> 040 * <p> 041 * The input String is split into a number of <em>tokens</em>. 042 * Each token is separated from the next String by a <em>delimiter</em>. 043 * One or more delimiter characters must be specified. 044 * </p> 045 * <p> 046 * Each token may be surrounded by quotes. 047 * The <em>quote</em> matcher specifies the quote character(s). 048 * A quote may be escaped within a quoted section by duplicating itself. 049 * </p> 050 * <p> 051 * Between each token and the delimiter are potentially characters that need trimming. 052 * The <em>trimmer</em> matcher specifies these characters. 053 * One usage might be to trim whitespace characters. 054 * </p> 055 * <p> 056 * At any point outside the quotes there might potentially be invalid characters. 057 * The <em>ignored</em> matcher specifies these characters to be removed. 058 * One usage might be to remove new line characters. 059 * </p> 060 * <p> 061 * Empty tokens may be removed or returned as null. 062 * </p> 063 * <pre> 064 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 065 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 067 * </pre> 068 * 069 * <table> 070 * <caption>StrTokenizer properties and options</caption> 071 * <tr> 072 * <th>Property</th><th>Type</th><th>Default</th> 073 * </tr> 074 * <tr> 075 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 076 * </tr> 077 * <tr> 078 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 079 * </tr> 080 * <tr> 081 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 082 * </tr> 083 * <tr> 084 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 085 * </tr> 086 * <tr> 087 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 088 * </tr> 089 * </table> 090 * 091 * @since 2.2 092 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text 093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html"> 094 * StringTokenizer</a>. 095 */ 096@Deprecated 097public class StrTokenizer implements ListIterator<String>, Cloneable { 098 099 // @formatter:off 100 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 101 .setDelimiterMatcher(StrMatcher.commaMatcher()) 102 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 103 .setIgnoredMatcher(StrMatcher.noneMatcher()) 104 .setTrimmerMatcher(StrMatcher.trimMatcher()) 105 .setEmptyTokenAsNull(false) 106 .setIgnoreEmptyTokens(false); 107 // @formatter:on 108 109 // @formatter:off 110 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer() 111 .setDelimiterMatcher(StrMatcher.tabMatcher()) 112 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher()) 113 .setIgnoredMatcher(StrMatcher.noneMatcher()) 114 .setTrimmerMatcher(StrMatcher.trimMatcher()) 115 .setEmptyTokenAsNull(false) 116 .setIgnoreEmptyTokens(false); 117 // @formatter:on 118 119 /** 120 * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 121 * 122 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 123 */ 124 private static StrTokenizer getCSVClone() { 125 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 126 } 127 128 /** 129 * Gets a new tokenizer instance which parses Comma Separated Value strings 130 * initializing it with the given input. The default for CSV processing 131 * will be trim whitespace from both ends (which can be overridden with 132 * the setTrimmer method). 133 * <p> 134 * You must call a "reset" method to set the string which you want to parse. 135 * </p> 136 * 137 * @return a new tokenizer instance which parses Comma Separated Value strings. 138 */ 139 public static StrTokenizer getCSVInstance() { 140 return getCSVClone(); 141 } 142 143 /** 144 * Gets a new tokenizer instance which parses Comma Separated Value strings 145 * initializing it with the given input. The default for CSV processing 146 * will be trim whitespace from both ends (which can be overridden with 147 * the setTrimmer method). 148 * 149 * @param input the text to parse. 150 * @return a new tokenizer instance which parses Comma Separated Value strings. 151 */ 152 public static StrTokenizer getCSVInstance(final char[] input) { 153 final StrTokenizer tok = getCSVClone(); 154 tok.reset(input); 155 return tok; 156 } 157 158 /** 159 * Gets a new tokenizer instance which parses Comma Separated Value strings 160 * initializing it with the given input. The default for CSV processing 161 * will be trim whitespace from both ends (which can be overridden with 162 * the setTrimmer method). 163 * 164 * @param input the text to parse. 165 * @return a new tokenizer instance which parses Comma Separated Value strings. 166 */ 167 public static StrTokenizer getCSVInstance(final String input) { 168 final StrTokenizer tok = getCSVClone(); 169 tok.reset(input); 170 return tok; 171 } 172 173 /** 174 * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 175 * 176 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 177 */ 178 private static StrTokenizer getTSVClone() { 179 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 180 } 181 182 /** 183 * Gets a new tokenizer instance which parses Tab Separated Value strings. 184 * The default for CSV processing will be trim whitespace from both ends 185 * (which can be overridden with the setTrimmer method). 186 * <p> 187 * You must call a "reset" method to set the string which you want to parse. 188 * </p> 189 * 190 * @return a new tokenizer instance which parses Tab Separated Value strings. 191 */ 192 public static StrTokenizer getTSVInstance() { 193 return getTSVClone(); 194 } 195 196 /** 197 * Gets a new tokenizer instance which parses Tab Separated Value strings. 198 * The default for CSV processing will be trim whitespace from both ends 199 * (which can be overridden with the setTrimmer method). 200 * 201 * @param input the string to parse. 202 * @return a new tokenizer instance which parses Tab Separated Value strings. 203 */ 204 public static StrTokenizer getTSVInstance(final char[] input) { 205 final StrTokenizer tok = getTSVClone(); 206 tok.reset(input); 207 return tok; 208 } 209 210 /** 211 * Gets a new tokenizer instance which parses Tab Separated Value strings. 212 * The default for CSV processing will be trim whitespace from both ends 213 * (which can be overridden with the setTrimmer method). 214 * 215 * @param input the string to parse. 216 * @return a new tokenizer instance which parses Tab Separated Value strings. 217 */ 218 public static StrTokenizer getTSVInstance(final String input) { 219 final StrTokenizer tok = getTSVClone(); 220 tok.reset(input); 221 return tok; 222 } 223 224 /** The text to work on. */ 225 private char[] chars; 226 227 /** The parsed tokens */ 228 private String[] tokens; 229 230 /** The current iteration position */ 231 private int tokenPos; 232 233 /** The delimiter matcher */ 234 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 235 236 /** The quote matcher */ 237 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 238 239 /** The ignored matcher */ 240 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 241 242 /** The trimmer matcher */ 243 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 244 245 /** Whether to return empty tokens as null */ 246 private boolean emptyAsNull; 247 248 /** Whether to ignore empty tokens */ 249 private boolean ignoreEmptyTokens = true; 250 251 /** 252 * Constructs a tokenizer splitting on space, tab, newline and formfeed 253 * as per StringTokenizer, but with no text to tokenize. 254 * <p> 255 * This constructor is normally used with {@link #reset(String)}. 256 * </p> 257 */ 258 public StrTokenizer() { 259 this.chars = null; 260 } 261 262 /** 263 * Constructs a tokenizer splitting on space, tab, newline and formfeed 264 * as per StringTokenizer. 265 * 266 * @param input the string which is to be parsed, not cloned. 267 */ 268 public StrTokenizer(final char[] input) { 269 this.chars = ArrayUtils.clone(input); 270 } 271 272 /** 273 * Constructs a tokenizer splitting on the specified character. 274 * 275 * @param input the string which is to be parsed, not cloned. 276 * @param delim the field delimiter character. 277 */ 278 public StrTokenizer(final char[] input, final char delim) { 279 this(input); 280 setDelimiterChar(delim); 281 } 282 283 /** 284 * Constructs a tokenizer splitting on the specified delimiter character 285 * and handling quotes using the specified quote character. 286 * 287 * @param input the string which is to be parsed, not cloned. 288 * @param delim the field delimiter character. 289 * @param quote the field quoted string character. 290 */ 291 public StrTokenizer(final char[] input, final char delim, final char quote) { 292 this(input, delim); 293 setQuoteChar(quote); 294 } 295 296 /** 297 * Constructs a tokenizer splitting on the specified string. 298 * 299 * @param input the string which is to be parsed, not cloned. 300 * @param delim the field delimiter string. 301 */ 302 public StrTokenizer(final char[] input, final String delim) { 303 this(input); 304 setDelimiterString(delim); 305 } 306 307 /** 308 * Constructs a tokenizer splitting using the specified delimiter matcher. 309 * 310 * @param input the string which is to be parsed, not cloned. 311 * @param delim the field delimiter matcher. 312 */ 313 public StrTokenizer(final char[] input, final StrMatcher delim) { 314 this(input); 315 setDelimiterMatcher(delim); 316 } 317 318 /** 319 * Constructs a tokenizer splitting using the specified delimiter matcher 320 * and handling quotes using the specified quote matcher. 321 * 322 * @param input the string which is to be parsed, not cloned. 323 * @param delim the field delimiter character. 324 * @param quote the field quoted string character. 325 */ 326 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 327 this(input, delim); 328 setQuoteMatcher(quote); 329 } 330 331 /** 332 * Constructs a tokenizer splitting on space, tab, newline and formfeed 333 * as per StringTokenizer. 334 * 335 * @param input the string which is to be parsed. 336 */ 337 public StrTokenizer(final String input) { 338 if (input != null) { 339 chars = input.toCharArray(); 340 } else { 341 chars = null; 342 } 343 } 344 345 /** 346 * Constructs a tokenizer splitting on the specified delimiter character. 347 * 348 * @param input the string which is to be parsed. 349 * @param delim the field delimiter character. 350 */ 351 public StrTokenizer(final String input, final char delim) { 352 this(input); 353 setDelimiterChar(delim); 354 } 355 356 /** 357 * Constructs a tokenizer splitting on the specified delimiter character 358 * and handling quotes using the specified quote character. 359 * 360 * @param input the string which is to be parsed. 361 * @param delim the field delimiter character. 362 * @param quote the field quoted string character. 363 */ 364 public StrTokenizer(final String input, final char delim, final char quote) { 365 this(input, delim); 366 setQuoteChar(quote); 367 } 368 369 /** 370 * Constructs a tokenizer splitting on the specified delimiter string. 371 * 372 * @param input the string which is to be parsed. 373 * @param delim the field delimiter string. 374 */ 375 public StrTokenizer(final String input, final String delim) { 376 this(input); 377 setDelimiterString(delim); 378 } 379 380 /** 381 * Constructs a tokenizer splitting using the specified delimiter matcher. 382 * 383 * @param input the string which is to be parsed. 384 * @param delim the field delimiter matcher. 385 */ 386 public StrTokenizer(final String input, final StrMatcher delim) { 387 this(input); 388 setDelimiterMatcher(delim); 389 } 390 391 /** 392 * Constructs a tokenizer splitting using the specified delimiter matcher 393 * and handling quotes using the specified quote matcher. 394 * 395 * @param input the string which is to be parsed. 396 * @param delim the field delimiter matcher. 397 * @param quote the field quoted string matcher. 398 */ 399 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 400 this(input, delim); 401 setQuoteMatcher(quote); 402 } 403 404 /** 405 * Unsupported ListIterator operation. 406 * 407 * @param obj this parameter ignored. 408 * @throws UnsupportedOperationException always. 409 */ 410 @Override 411 public void add(final String obj) { 412 throw new UnsupportedOperationException("add() is unsupported"); 413 } 414 415 /** 416 * Adds a token to a list, paying attention to the parameters we've set. 417 * 418 * @param list the list to add to. 419 * @param tok the token to add. 420 */ 421 private void addToken(final List<String> list, String tok) { 422 if (StringUtils.isEmpty(tok)) { 423 if (isIgnoreEmptyTokens()) { 424 return; 425 } 426 if (isEmptyTokenAsNull()) { 427 tok = null; 428 } 429 } 430 list.add(tok); 431 } 432 433 /** 434 * Checks if tokenization has been done, and if not then do it. 435 */ 436 private void checkTokenized() { 437 if (tokens == null) { 438 if (chars == null) { 439 // still call tokenize as subclass may do some work 440 final List<String> split = tokenize(null, 0, 0); 441 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 442 } else { 443 final List<String> split = tokenize(chars, 0, chars.length); 444 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 445 } 446 } 447 } 448 449 /** 450 * Creates a new instance of this Tokenizer. The new instance is reset so 451 * that it will be at the start of the token list. 452 * If a {@link CloneNotSupportedException} is caught, return {@code null}. 453 * 454 * @return a new instance of this Tokenizer which has been reset. 455 */ 456 @Override 457 public Object clone() { 458 try { 459 return cloneReset(); 460 } catch (final CloneNotSupportedException ex) { 461 return null; 462 } 463 } 464 465 /** 466 * Creates a new instance of this Tokenizer. The new instance is reset so that 467 * it will be at the start of the token list. 468 * 469 * @return a new instance of this Tokenizer which has been reset. 470 * @throws CloneNotSupportedException if there is a problem cloning. 471 */ 472 Object cloneReset() throws CloneNotSupportedException { 473 // this method exists to enable 100% test coverage 474 final StrTokenizer cloned = (StrTokenizer) super.clone(); 475 if (cloned.chars != null) { 476 cloned.chars = cloned.chars.clone(); 477 } 478 cloned.reset(); 479 return cloned; 480 } 481 482 /** 483 * Gets the String content that the tokenizer is parsing. 484 * 485 * @return the string content being parsed. 486 */ 487 public String getContent() { 488 if (chars == null) { 489 return null; 490 } 491 return new String(chars); 492 } 493 494 /** 495 * Gets the field delimiter matcher. 496 * 497 * @return the delimiter matcher in use. 498 */ 499 public StrMatcher getDelimiterMatcher() { 500 return this.delimMatcher; 501 } 502 503 /** 504 * Gets the ignored character matcher. 505 * <p> 506 * These characters are ignored when parsing the String, unless they are 507 * within a quoted region. 508 * The default value is not to ignore anything. 509 * </p> 510 * 511 * @return the ignored matcher in use. 512 */ 513 public StrMatcher getIgnoredMatcher() { 514 return ignoredMatcher; 515 } 516 517 /** 518 * Gets the quote matcher currently in use. 519 * <p> 520 * The quote character is used to wrap data between the tokens. 521 * This enables delimiters to be entered as data. 522 * The default value is '"' (double quote). 523 * </p> 524 * 525 * @return the quote matcher in use. 526 */ 527 public StrMatcher getQuoteMatcher() { 528 return quoteMatcher; 529 } 530 531 /** 532 * Gets a copy of the full token list as an independent modifiable array. 533 * 534 * @return the tokens as a String array. 535 */ 536 public String[] getTokenArray() { 537 checkTokenized(); 538 return tokens.clone(); 539 } 540 541 /** 542 * Gets a copy of the full token list as an independent modifiable list. 543 * 544 * @return the tokens as a String array. 545 */ 546 public List<String> getTokenList() { 547 checkTokenized(); 548 final List<String> list = new ArrayList<>(tokens.length); 549 list.addAll(Arrays.asList(tokens)); 550 return list; 551 } 552 553 /** 554 * Gets the trimmer character matcher. 555 * <p> 556 * These characters are trimmed off on each side of the delimiter 557 * until the token or quote is found. 558 * The default value is not to trim anything. 559 * </p> 560 * 561 * @return the trimmer matcher in use. 562 */ 563 public StrMatcher getTrimmerMatcher() { 564 return trimmerMatcher; 565 } 566 567 /** 568 * Checks whether there are any more tokens. 569 * 570 * @return true if there are more tokens. 571 */ 572 @Override 573 public boolean hasNext() { 574 checkTokenized(); 575 return tokenPos < tokens.length; 576 } 577 578 /** 579 * Checks whether there are any previous tokens that can be iterated to. 580 * 581 * @return true if there are previous tokens. 582 */ 583 @Override 584 public boolean hasPrevious() { 585 checkTokenized(); 586 return tokenPos > 0; 587 } 588 589 /** 590 * Gets whether the tokenizer currently returns empty tokens as null. 591 * The default for this property is false. 592 * 593 * @return true if empty tokens are returned as null. 594 */ 595 public boolean isEmptyTokenAsNull() { 596 return this.emptyAsNull; 597 } 598 599 /** 600 * Gets whether the tokenizer currently ignores empty tokens. 601 * The default for this property is true. 602 * 603 * @return true if empty tokens are not returned. 604 */ 605 public boolean isIgnoreEmptyTokens() { 606 return ignoreEmptyTokens; 607 } 608 609 /** 610 * Checks if the characters at the index specified match the quote 611 * already matched in readNextToken(). 612 * 613 * @param srcChars the character array being tokenized. 614 * @param pos the position to check for a quote. 615 * @param len the length of the character array being tokenized. 616 * @param quoteStart the start position of the matched quote, 0 if no quoting. 617 * @param quoteLen the length of the matched quote, 0 if no quoting. 618 * @return true if a quote is matched. 619 */ 620 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { 621 for (int i = 0; i < quoteLen; i++) { 622 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 623 return false; 624 } 625 } 626 return true; 627 } 628 629 /** 630 * Gets the next token. 631 * 632 * @return the next String token. 633 * @throws NoSuchElementException if there are no more elements. 634 */ 635 @Override 636 public String next() { 637 if (hasNext()) { 638 return tokens[tokenPos++]; 639 } 640 throw new NoSuchElementException(); 641 } 642 643 /** 644 * Gets the index of the next token to return. 645 * 646 * @return the next token index. 647 */ 648 @Override 649 public int nextIndex() { 650 return tokenPos; 651 } 652 653 /** 654 * Gets the next token from the String. 655 * Equivalent to {@link #next()} except it returns null rather than 656 * throwing {@link NoSuchElementException} when no tokens remain. 657 * 658 * @return the next sequential token, or null when no more tokens are found. 659 */ 660 public String nextToken() { 661 if (hasNext()) { 662 return tokens[tokenPos++]; 663 } 664 return null; 665 } 666 667 /** 668 * Gets the token previous to the last returned token. 669 * 670 * @return the previous token. 671 */ 672 @Override 673 public String previous() { 674 if (hasPrevious()) { 675 return tokens[--tokenPos]; 676 } 677 throw new NoSuchElementException(); 678 } 679 680 /** 681 * Gets the index of the previous token. 682 * 683 * @return the previous token index. 684 */ 685 @Override 686 public int previousIndex() { 687 return tokenPos - 1; 688 } 689 690 /** 691 * Gets the previous token from the String. 692 * 693 * @return the previous sequential token, or null when no more tokens are found. 694 */ 695 public String previousToken() { 696 if (hasPrevious()) { 697 return tokens[--tokenPos]; 698 } 699 return null; 700 } 701 702 /** 703 * Reads character by character through the String to get the next token. 704 * 705 * @param srcChars the character array being tokenized. 706 * @param start the first character of field. 707 * @param len the length of the character array being tokenized. 708 * @param workArea a temporary work area. 709 * @param tokenList the list of parsed tokens. 710 * @return the starting position of the next field (the character 711 * immediately after the delimiter), or -1 if end of string found. 712 */ 713 private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { 714 // skip all leading whitespace, unless it is the 715 // field delimiter or the quote character 716 while (start < len) { 717 final int removeLen = Math.max( 718 getIgnoredMatcher().isMatch(srcChars, start, start, len), 719 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 720 if (removeLen == 0 || 721 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || 722 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 723 break; 724 } 725 start += removeLen; 726 } 727 728 // handle reaching end 729 if (start >= len) { 730 addToken(tokenList, StringUtils.EMPTY); 731 return -1; 732 } 733 734 // handle empty token 735 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 736 if (delimLen > 0) { 737 addToken(tokenList, StringUtils.EMPTY); 738 return start + delimLen; 739 } 740 741 // handle found token 742 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 743 if (quoteLen > 0) { 744 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 745 } 746 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 747 } 748 749 /** 750 * Reads a possibly quoted string token. 751 * 752 * @param srcChars the character array being tokenized. 753 * @param start the first character of field. 754 * @param len the length of the character array being tokenized. 755 * @param workArea a temporary work area. 756 * @param tokenList the list of parsed tokens. 757 * @param quoteStart the start position of the matched quote, 0 if no quoting. 758 * @param quoteLen the length of the matched quote, 0 if no quoting. 759 * @return the starting position of the next field (the character 760 * immediately after the delimiter, or if end of string found, 761 * then the length of string. 762 */ 763 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 764 final List<String> tokenList, final int quoteStart, final int quoteLen) { 765 // Loop until we've found the end of the quoted 766 // string or the end of the input 767 workArea.clear(); 768 int pos = start; 769 boolean quoting = quoteLen > 0; 770 int trimStart = 0; 771 772 while (pos < len) { 773 // quoting mode can occur several times throughout a string 774 // we must switch between quoting and non-quoting until we 775 // encounter a non-quoted delimiter, or end of string 776 if (quoting) { 777 // In quoting mode 778 779 // If we've found a quote character, see if it's 780 // followed by a second quote. If so, then we need 781 // to actually put the quote character into the token 782 // rather than end the token. 783 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 784 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 785 // matched pair of quotes, thus an escaped quote 786 workArea.append(srcChars, pos, quoteLen); 787 pos += quoteLen * 2; 788 trimStart = workArea.size(); 789 continue; 790 } 791 792 // end of quoting 793 quoting = false; 794 pos += quoteLen; 795 continue; 796 } 797 798 } else { 799 // Not in quoting mode 800 801 // check for delimiter, and thus end of token 802 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 803 if (delimLen > 0) { 804 // return condition when end of token found 805 addToken(tokenList, workArea.substring(0, trimStart)); 806 return pos + delimLen; 807 } 808 809 // check for quote, and thus back into quoting mode 810 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 811 quoting = true; 812 pos += quoteLen; 813 continue; 814 } 815 816 // check for ignored (outside quotes), and ignore 817 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 818 if (ignoredLen > 0) { 819 pos += ignoredLen; 820 continue; 821 } 822 823 // check for trimmed character 824 // don't yet know if it's at the end, so copy to workArea 825 // use trimStart to keep track of trim at the end 826 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 827 if (trimmedLen > 0) { 828 workArea.append(srcChars, pos, trimmedLen); 829 pos += trimmedLen; 830 continue; 831 } 832 } 833 // copy regular character from inside quotes 834 workArea.append(srcChars[pos++]); 835 trimStart = workArea.size(); 836 } 837 838 // return condition when end of string found 839 addToken(tokenList, workArea.substring(0, trimStart)); 840 return -1; 841 } 842 843 /** 844 * Unsupported ListIterator operation. 845 * 846 * @throws UnsupportedOperationException always. 847 */ 848 @Override 849 public void remove() { 850 throw new UnsupportedOperationException("remove() is unsupported"); 851 } 852 853 /** 854 * Resets this tokenizer, forgetting all parsing and iteration already completed. 855 * <p> 856 * This method allows the same tokenizer to be reused for the same String. 857 * </p> 858 * 859 * @return {@code this} instance. 860 */ 861 public StrTokenizer reset() { 862 tokenPos = 0; 863 tokens = null; 864 return this; 865 } 866 867 /** 868 * Reset this tokenizer, giving it a new input string to parse. 869 * In this manner you can re-use a tokenizer with the same settings 870 * on multiple input lines. 871 * 872 * @param input the new character array to tokenize, not cloned, null sets no text to parse. 873 * @return {@code this} instance. 874 */ 875 public StrTokenizer reset(final char[] input) { 876 reset(); 877 this.chars = ArrayUtils.clone(input); 878 return this; 879 } 880 881 /** 882 * Reset this tokenizer, giving it a new input string to parse. 883 * In this manner you can re-use a tokenizer with the same settings 884 * on multiple input lines. 885 * 886 * @param input the new string to tokenize, null sets no text to parse. 887 * @return {@code this} instance. 888 */ 889 public StrTokenizer reset(final String input) { 890 reset(); 891 if (input != null) { 892 this.chars = input.toCharArray(); 893 } else { 894 this.chars = null; 895 } 896 return this; 897 } 898 899 /** 900 * Unsupported ListIterator operation. 901 * 902 * @param obj this parameter ignored. 903 * @throws UnsupportedOperationException always. 904 */ 905 @Override 906 public void set(final String obj) { 907 throw new UnsupportedOperationException("set() is unsupported"); 908 } 909 910 /** 911 * Sets the field delimiter character. 912 * 913 * @param delim the delimiter character to use. 914 * @return {@code this} instance. 915 */ 916 public StrTokenizer setDelimiterChar(final char delim) { 917 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 918 } 919 920 /** 921 * Sets the field delimiter matcher. 922 * <p> 923 * The delimiter is used to separate one token from another. 924 * </p> 925 * 926 * @param delim the delimiter matcher to use. 927 * @return {@code this} instance. 928 */ 929 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 930 if (delim == null) { 931 this.delimMatcher = StrMatcher.noneMatcher(); 932 } else { 933 this.delimMatcher = delim; 934 } 935 return this; 936 } 937 938 /** 939 * Sets the field delimiter string. 940 * 941 * @param delim the delimiter string to use. 942 * @return {@code this} instance. 943 */ 944 public StrTokenizer setDelimiterString(final String delim) { 945 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 946 } 947 948 /** 949 * Sets whether the tokenizer should return empty tokens as null. 950 * The default for this property is false. 951 * 952 * @param emptyAsNull whether empty tokens are returned as null. 953 * @return {@code this} instance. 954 */ 955 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 956 this.emptyAsNull = emptyAsNull; 957 return this; 958 } 959 960 /** 961 * Sets the character to ignore. 962 * <p> 963 * This character is ignored when parsing the String, unless it is 964 * within a quoted region. 965 * 966 * @param ignored the ignored character to use. 967 * @return {@code this} instance. 968 */ 969 public StrTokenizer setIgnoredChar(final char ignored) { 970 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 971 } 972 973 /** 974 * Sets the matcher for characters to ignore. 975 * <p> 976 * These characters are ignored when parsing the String, unless they are 977 * within a quoted region. 978 * </p> 979 * 980 * @param ignored the ignored matcher to use, null ignored. 981 * @return {@code this} instance. 982 */ 983 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 984 if (ignored != null) { 985 this.ignoredMatcher = ignored; 986 } 987 return this; 988 } 989 990 /** 991 * Sets whether the tokenizer should ignore and not return empty tokens. 992 * The default for this property is true. 993 * 994 * @param ignoreEmptyTokens whether empty tokens are not returned. 995 * @return {@code this} instance. 996 */ 997 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 998 this.ignoreEmptyTokens = ignoreEmptyTokens; 999 return this; 1000 } 1001 1002 /** 1003 * Sets the quote character to use. 1004 * <p> 1005 * The quote character is used to wrap data between the tokens. 1006 * This enables delimiters to be entered as data. 1007 * </p> 1008 * 1009 * @param quote the quote character to use. 1010 * @return {@code this} instance. 1011 */ 1012 public StrTokenizer setQuoteChar(final char quote) { 1013 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 1014 } 1015 1016 /** 1017 * Sets the quote matcher to use. 1018 * <p> 1019 * The quote character is used to wrap data between the tokens. 1020 * This enables delimiters to be entered as data. 1021 * </p> 1022 * 1023 * @param quote the quote matcher to use, null ignored. 1024 * @return {@code this} instance. 1025 */ 1026 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1027 if (quote != null) { 1028 this.quoteMatcher = quote; 1029 } 1030 return this; 1031 } 1032 1033 /** 1034 * Sets the matcher for characters to trim. 1035 * <p> 1036 * These characters are trimmed off on each side of the delimiter 1037 * until the token or quote is found. 1038 * </p> 1039 * 1040 * @param trimmer the trimmer matcher to use, null ignored. 1041 * @return {@code this} instance. 1042 */ 1043 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1044 if (trimmer != null) { 1045 this.trimmerMatcher = trimmer; 1046 } 1047 return this; 1048 } 1049 1050 /** 1051 * Gets the number of tokens found in the String. 1052 * 1053 * @return the number of matched tokens. 1054 */ 1055 public int size() { 1056 checkTokenized(); 1057 return tokens.length; 1058 } 1059 1060 /** 1061 * Internal method to performs the tokenization. 1062 * <p> 1063 * Most users of this class do not need to call this method. This method 1064 * will be called automatically by other (public) methods when required. 1065 * </p> 1066 * <p> 1067 * This method exists to allow subclasses to add code before or after the 1068 * tokenization. For example, a subclass could alter the character array, 1069 * offset or count to be parsed, or call the tokenizer multiple times on 1070 * multiple strings. It is also be possible to filter the results. 1071 * </p> 1072 * <p> 1073 * {@link StrTokenizer} will always pass a zero offset and a count 1074 * equal to the length of the array to this method, however a subclass 1075 * may pass other values, or even an entirely different array. 1076 * </p> 1077 * 1078 * @param srcChars the character array being tokenized, may be null. 1079 * @param offset the start position within the character array, must be valid. 1080 * @param count the number of characters to tokenize, must be valid. 1081 * @return the modifiable list of String tokens, unmodifiable if null array or zero count. 1082 */ 1083 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1084 if (ArrayUtils.isEmpty(srcChars)) { 1085 return Collections.emptyList(); 1086 } 1087 final StrBuilder buf = new StrBuilder(); 1088 final List<String> tokenList = new ArrayList<>(); 1089 int pos = offset; 1090 1091 // loop around the entire buffer 1092 while (pos >= 0 && pos < count) { 1093 // find next token 1094 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1095 1096 // handle case where end of string is a delimiter 1097 if (pos >= count) { 1098 addToken(tokenList, StringUtils.EMPTY); 1099 } 1100 } 1101 return tokenList; 1102 } 1103 1104 /** 1105 * Gets the String content that the tokenizer is parsing. 1106 * 1107 * @return the string content being parsed. 1108 */ 1109 @Override 1110 public String toString() { 1111 if (tokens == null) { 1112 return "StrTokenizer[not tokenized yet]"; 1113 } 1114 return "StrTokenizer" + getTokenList(); 1115 } 1116 1117}