/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.validator.routines;

import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.GenericValidator;

/**
 * <p><strong>URL Validation</strong> routines.</p>
 * Behavior of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE]  By default fragments are allowed, if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
 * <li>ALLOW_LOCAL_URLS - [FALSE] Allows local URLs, such as https://localhost/
 * or https://machine/ .</li>
 * </ul>
 *
 * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * https://javascript.internet.com. However, this validation now bears little resemblance
 * to the PHP original.</p>
 * <pre>
 *   Example of usage:
 *   Construct a UrlValidator with valid schemes of "http", and "https".
 *
 *    String[] schemes = {"http","https"};
 *    UrlValidator urlValidator = new UrlValidator(schemes);
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *    prints "URL is invalid"
 *   If instead the default constructor is used.
 *
 *    UrlValidator urlValidator = new UrlValidator();
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *   prints out "URL is valid"
 *  </pre>
 *
 * @see
 * <a href="https://www.ietf.org/rfc/rfc2396.txt">
 *  Uniform Resource Identifiers (URI): Generic Syntax
 * </a>
 *
 * @since 1.4
 */
public class UrlValidator implements Serializable {

    private static final long serialVersionUID = 7557161713937335013L;

    private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max

    /**
     * Allows all validly formatted schemes to pass validation instead of
     * supplying a set of valid schemes.
     */
    public static final long ALLOW_ALL_SCHEMES = 1 << 0;

    /**
     * Allow two slashes in the path component of the URL.
     */
    public static final long ALLOW_2_SLASHES = 1 << 1;

    /**
     * Enabling this option disallows any URL fragments.
     */
    public static final long NO_FRAGMENTS = 1 << 2;

    /**
     * Allow local URLs, such as https://localhost/ or https://machine/ .
     * This enables a broad-brush check, for complex local machine name
     * validation requirements you should create your validator with
     * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
     */
    public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber

    /**
     * Protocol scheme (for example, http, ftp, https).
     */
    private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
    private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);

    // Drop numeric, and "+-." for now
    // TODO does not allow for optional userinfo.
    // Validation of character set is done by isValidAuthority
    private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6

    // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123
    // Do this as a separate match because ':' could cause ambiguity with the port prefix.
    private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+";

    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
    // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
    // We assume that password has the same valid chars as user info.
    // The hyphen is escaped so that it is a literal rather than forming the range '%'..'.';
    // that range happened to cover exactly the intended characters, so the accepted set is unchanged.
    private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%\\-._~!$&'()*+,;=]";

    // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
    private static final String USERINFO_FIELD_REGEX =
            USERINFO_CHARS_REGEX + "+" + // At least one character for the name
            "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent

    // Capturing groups, in order:
    //   1 = IPv6 literal (without the enclosing brackets)
    //   2 = host name or IPv4 address (the optional userinfo, for example user:pass@, is not captured)
    //   3 = port digits (without the leading colon)
    //   4 = any trailing extra text
    private static final String AUTHORITY_REGEX =
            "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
    private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

    private static final int PARSE_AUTHORITY_IPV6 = 1;

    private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present

    private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon

    /**
     * Should always be empty. The code currently allows spaces.
     */
    private static final int PARSE_AUTHORITY_EXTRA = 4;

    private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
    private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);

    private static final String QUERY_REGEX = "^(\\S*)$";
    private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);

    /**
     * If no schemes are provided, default to this set.
     */
    private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case

    /**
     * Singleton instance of this class with default schemes and options.
     */
    private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();

    /**
     * Returns the singleton instance of this class with default schemes and options.
     *
     * @return singleton instance with default schemes and options
     */
    public static UrlValidator getInstance() {
        return DEFAULT_URL_VALIDATOR;
    }

    /**
     * Tests whether the given flag is on. If the flag is not a power of 2
     * (for example, 3) this tests whether <em>any</em> of the combined flags is on.
     *
     * @param flag Flag value to check.
     * @param options what to check
     * @return whether the specified flag value is on.
     */
    private static boolean isOn(final long flag, final long options) {
        // "!= 0" rather than "> 0" so that a flag occupying the sign bit is also detected
        return (options & flag) != 0;
    }

    /**
     * Validates the optional port part of an authority.
     *
     * @param port the port digits without the leading colon; may be {@code null} or blank
     * @return true if the port is absent/blank or is a number in the range 0..65535
     */
    private static boolean isValidPort(final String port) {
        if (GenericValidator.isBlankOrNull(port)) {
            return true; // the port is optional
        }
        try {
            final int iPort = Integer.parseInt(port);
            return iPort >= 0 && iPort <= MAX_UNSIGNED_16_BIT_INT;
        } catch (final NumberFormatException ignored) {
            return false; // this can happen for big numbers
        }
    }

    /**
     * Holds the set of current validation options.
     */
    private final long options;

    /**
     * The set of schemes that are allowed to be in a URL.
     */
    private final Set<String> allowedSchemes; // Must be lower-case

    /**
     * Regular expressions used to manually validate authorities if IANA
     * domain name validation isn't desired.
     */
    private final RegexValidator authorityValidator;

    /**
     * The domain validator.
     */
    private final DomainValidator domainValidator;

    /**
     * Constructs a new instance with default properties.
     */
    public UrlValidator() {
        this(null);
    }

    /**
     * Constructs a new instance with the given validation options.
     *
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final long options) {
        this(null, null, options);
    }

    /**
     * Constructs a new instance with the given validation options.
     *
     * @param authorityValidator Regular expression validator used to validate the authority part
     * This allows the user to override the standard set of domains.
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
     * enables both of those options.
     */
    public UrlValidator(final RegexValidator authorityValidator, final long options) {
        this(null, authorityValidator, options);
    }

    /**
     * Constructs a new instance that accepts only the given schemes, with no options set.
     *
     * @param schemes Pass in one or more URL schemes to consider valid, passing in
     *        a null will default to "http,https,ftp" being valid.
     *        If a non-null schemes is specified then all valid schemes must
     *        be specified. Setting the ALLOW_ALL_SCHEMES option will
     *        ignore the contents of schemes.
     */
    public UrlValidator(final String[] schemes) {
        this(schemes, 0L);
    }

    /**
     * Constructs a new instance with the given schemes and validation options.
     *
     * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final String[] schemes, final long options) {
        this(schemes, null, options);
    }

    /**
     * Customizable constructor. Validation behavior is modified by passing in options.
     *
     * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * @param authorityValidator Regular expression validator used to validate the authority part
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
     * enables both of those options.
     */
    public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) {
        this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options)));
    }

    /**
     * Customizable constructor. Validation behavior is modified by passing in options.
     *
     * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * @param authorityValidator Regular expression validator used to validate the authority part
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p>{@code ALLOW_2_SLASHES + NO_FRAGMENTS}</p>
     * enables both of those options.
     * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting
     * @throws IllegalArgumentException if {@code domainValidator} is null or its allow-local
     * setting differs from the ALLOW_LOCAL_URLS option
     * @since 1.7
     */
    public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options,
            final DomainValidator domainValidator) {
        this.options = options;
        if (domainValidator == null) {
            throw new IllegalArgumentException("DomainValidator must not be null");
        }
        if (domainValidator.isAllowLocal() != isOn(ALLOW_LOCAL_URLS, options)) {
            throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting");
        }
        this.domainValidator = domainValidator;

        if (isOn(ALLOW_ALL_SCHEMES, options)) {
            // The scheme set is never consulted when every scheme is allowed
            allowedSchemes = Collections.emptySet();
        } else {
            final String[] effectiveSchemes = schemes == null ? DEFAULT_SCHEMES : schemes;
            allowedSchemes = new HashSet<>(effectiveSchemes.length);
            for (final String scheme : effectiveSchemes) {
                // Stored lower-case so that isValidScheme can match case-blind
                allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
            }
        }

        this.authorityValidator = authorityValidator;
    }

    /**
     * Returns the number of times the token appears in the target.
     * Overlapping occurrences are counted individually, because the search
     * resumes one character after the start of each match. An empty token
     * is counted as occurring zero times.
     *
     * @param token Token value to be counted.
     * @param target Target value to count tokens in.
     * @return the number of tokens.
     */
    protected int countToken(final String token, final String target) {
        if (token.isEmpty()) {
            // indexOf("", i) always reports a match, so the scan below would never terminate
            return 0;
        }
        int count = 0;
        int tokenIndex = target.indexOf(token);
        while (tokenIndex > -1) {
            count++;
            tokenIndex = target.indexOf(token, tokenIndex + 1);
        }
        return count;
    }

    /**
     * Tests whether the given flag is off. If the flag is not a power of 2
     * (for example, 3) this tests whether <em>all</em> of the combined flags are off.
     *
     * @param flag Flag value to check.
     * @return whether the specified flag value is off.
     */
    private boolean isOff(final long flag) {
        return (options & flag) == 0;
    }

    /**
     * Tests whether the given flag is on in this validator's options. If the flag is
     * not a power of 2 (for example, 3) this tests whether <em>any</em> of the combined
     * flags is on.
     *
     * @param flag Flag value to check.
     * @return whether the specified flag value is on.
     */
    private boolean isOn(final long flag) {
        return isOn(flag, options);
    }

    /**
     * <p>Checks if a field has a valid URL address.</p>
     *
     * Note that the method calls {@link #isValidAuthority(String)}
     * which checks that the domain is valid.
     *
     * @param value The value validation is being performed on.  A {@code null}
     * value is considered invalid.
     * @return true if the URL is valid.
     */
    public boolean isValid(final String value) {
        if (value == null) {
            return false;
        }
        final URI uri; // ensure value is a valid URI
        try {
            uri = new URI(value);
        } catch (final URISyntaxException e) {
            return false;
        }
        // OK, perform additional validation
        final String scheme = uri.getScheme();
        if (!isValidScheme(scheme)) {
            return false;
        }
        final String authority = uri.getRawAuthority();
        // Scheme matching is case-blind (see isValidScheme), so the file: special cases must be too
        if ("file".equalsIgnoreCase(scheme)) {
            if (GenericValidator.isBlankOrNull(authority)) { // Special case - file: allows an empty authority
                return true; // this is a local file - nothing more to do here
            }
            if (authority.contains(":")) { // a file: authority must not contain a colon
                return false;
            }
        }
        // Validate the authority
        if (!isValidAuthority(authority)) {
            return false;
        }
        return isValidPath(uri.getRawPath())
                && isValidQuery(uri.getRawQuery())
                && isValidFragment(uri.getRawFragment());
    }

    /**
     * Returns true if the authority is properly formatted.  An authority is the combination
     * of hostname and port.  A {@code null} authority value is considered invalid.
     * Note: this implementation validates the domain unless a RegexValidator was provided.
     * If a RegexValidator was supplied, and it matches, then the authority is regarded
     * as valid with no further checks, otherwise the method checks against the
     * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS).
     * The port, when present, must be in the range 0..65535 for both
     * bracketed IPv6 hosts and host name / IPv4 hosts.
     *
     * @param authority Authority value to validate, allows IDN
     * @return true if authority (hostname and port) is valid.
     */
    protected boolean isValidAuthority(final String authority) {
        if (authority == null) {
            return false;
        }

        // check manual authority validation if specified
        if (authorityValidator != null && authorityValidator.isValid(authority)) {
            return true;
        }
        // convert to ASCII if possible
        final String authorityASCII = DomainValidator.unicodeToASCII(authority);

        final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
        if (!authorityMatcher.matches()) {
            return false;
        }

        // We have to process IPV6 separately because that is parsed in a different group
        final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
        if (ipv6 != null) {
            if (!InetAddressValidator.getInstance().isValidInet6Address(ipv6)) {
                return false;
            }
        } else {
            final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
            // check if authority is hostname or IP address:
            // try a hostname first since that's much more likely, then fall back to IPv4
            if (!domainValidator.isValid(hostLocation)
                    && !InetAddressValidator.getInstance().isValidInet4Address(hostLocation)) {
                // isn't a known domain nor IPv4, so the URL is invalid
                return false;
            }
        }

        // The port group follows either host form, so it is checked for IPv6 hosts as well
        if (!isValidPort(authorityMatcher.group(PARSE_AUTHORITY_PORT))) {
            return false;
        }

        // Anything other than whitespace after the port makes the authority invalid
        final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
        return extra == null || extra.trim().isEmpty();
    }

    /**
     * Returns true if the given fragment is null or fragments are allowed.
     *
     * @param fragment Fragment value to validate.
     * @return true if fragment is valid.
     */
    protected boolean isValidFragment(final String fragment) {
        if (fragment == null) {
            return true;
        }

        return isOff(NO_FRAGMENTS);
    }

    /**
     * Returns true if the path is valid.  A {@code null} value is considered invalid.
     * A path is rejected if it does not match the allowed character set, if its
     * normalized form climbs above the root ("/.." or "/../..."), or if it contains
     * "//" while ALLOW_2_SLASHES is off.
     *
     * @param path Path value to validate.
     * @return true if path is valid.
     */
    protected boolean isValidPath(final String path) {
        if (path == null || !PATH_PATTERN.matcher(path).matches()) {
            return false;
        }

        try {
            // Don't omit host otherwise leading path may be taken as host if it starts with //
            final URI uri = new URI(null, "localhost", path, null);
            final String norm = uri.normalize().getPath();
            if (norm.startsWith("/../") // Trying to go via the parent dir
                    || norm.equals("/..")) { // Trying to go to the parent dir
                return false;
            }
        } catch (final URISyntaxException e) {
            return false;
        }

        final int slash2Count = countToken("//", path);
        if (isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
            return false;
        }

        return true;
    }

    /**
     * Returns true if the query is null, or it's a properly formatted query string.
     *
     * @param query Query value to validate.
     * @return true if query is valid.
     */
    protected boolean isValidQuery(final String query) {
        if (query == null) {
            return true;
        }
        return QUERY_PATTERN.matcher(query).matches();
    }

    /**
     * Validate scheme. If schemes[] was initialized to a non-null,
     * then only those schemes are allowed.
     * Otherwise, the default schemes are "http", "https", "ftp".
     * Matching is case-blind.
     *
     * @param scheme The scheme to validate.  A {@code null} value is considered
     * invalid.
     * @return true if valid.
     */
    protected boolean isValidScheme(final String scheme) {
        if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches()) {
            return false;
        }
        // Any well-formed scheme passes when ALLOW_ALL_SCHEMES is set; otherwise it must be in the allowed set
        return isOn(ALLOW_ALL_SCHEMES) || allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH));
    }

}