/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.validator;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.routines.InetAddressValidator;
import org.apache.commons.validator.util.Flags;

/**
 * <p>Validates URLs.</p>
 * Behaviour of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE]  By default fragments are allowed, if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
 * </ul>
 *
 * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * https://javascript.internet.com. However, this validation now bears little resemblance
 * to the PHP original.</p>
 * <pre>
 *   Example of usage:
 *   Construct a UrlValidator with valid schemes of "http", and "https".
 *
 *    String[] schemes = {"http","https"};
 *    UrlValidator urlValidator = new UrlValidator(schemes);
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *    prints "URL is invalid"
 *   If instead the default constructor is used.
 *
 *    UrlValidator urlValidator = new UrlValidator();
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *   prints out "URL is valid"
 *  </pre>
 *
 * @see
 * <a href="https://www.ietf.org/rfc/rfc2396.txt">
 *  Uniform Resource Identifiers (URI): Generic Syntax
 * </a>
 *
 * @since 1.1
 * @deprecated Use the new UrlValidator in the routines package. This class
 * will be removed in a future release.
 */
@Deprecated
public class UrlValidator implements Serializable {

    /** Longest top-level domain label (in characters) accepted by {@link #isValidAuthority(String)}. */
    private static final int TOP_LEVEL_MAX_LEN = 4;

    /** Shortest top-level domain label (in characters) accepted by {@link #isValidAuthority(String)}. */
    private static final int TOP_LEVEL_MIN_LEN = 2;

    private static final long serialVersionUID = 24137157400029593L;

    /**
     * Allows all validly formatted schemes to pass validation instead of
     * supplying a set of valid schemes.
     */
    public static final int ALLOW_ALL_SCHEMES = 1 << 0;

    /**
     * Allow two slashes in the path component of the URL.
     */
    public static final int ALLOW_2_SLASHES = 1 << 1;

    /**
     * Enabling this option disallows any URL fragments.
     */
    public static final int NO_FRAGMENTS = 1 << 2;

    private static final String ALPHA_CHARS = "a-zA-Z";

    private static final String SPECIAL_CHARS = ";/@&=,.?:+$";

    /** Character class matching any single character that is neither whitespace nor a special character. */
    private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";

    /** Characters accepted in the host part of an authority: ASCII letters, digits, '-' and '.'. */
    private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\.";

    /** One or more valid characters, i.e. a single dot-free label of a host name. */
    private static final String ATOM = VALID_CHARS + '+';

    /**
     * This expression derived/taken from the BNF for URI (RFC2396).
     * Capturing groups: 1 = "scheme:", 2 = scheme, 3 = "//authority", 4 = authority,
     * 5 = path, 6 = "?query", 7 = query, 8 = "#fragment", 9 = fragment.
     */
    private static final String URL_REGEX =
            "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";

    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);

    /**
     * Schema/Protocol (ie. http:, ftp:, file:, etc).
     */
    private static final int PARSE_URL_SCHEME = 2;

    /**
     * Includes hostname/ip and port number.
     */
    private static final int PARSE_URL_AUTHORITY = 4;

    private static final int PARSE_URL_PATH = 5;

    private static final int PARSE_URL_QUERY = 7;

    private static final int PARSE_URL_FRAGMENT = 9;

    /**
     * Protocol (for example, http:, ftp:, https:).
     */
    private static final Pattern SCHEME_PATTERN = Pattern.compile("^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*");

    /**
     * Splits an authority into its parts.
     * Capturing groups: 1 = host name or IP address, 2 = ":port", 3 = any trailing extra text.
     */
    private static final String AUTHORITY_REGEX =
            "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?";

    private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

    private static final int PARSE_AUTHORITY_HOST_IP = 1;

    private static final int PARSE_AUTHORITY_PORT = 2;

    /**
     * Should always be empty.
     */
    private static final int PARSE_AUTHORITY_EXTRA = 3;

    private static final Pattern PATH_PATTERN = Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$");

    private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");

    private static final Pattern LEGAL_ASCII_PATTERN = Pattern.compile("^\\p{ASCII}+$");

    /** One or more atoms separated by single dots. */
    private static final Pattern DOMAIN_PATTERN =
            Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");

    /** A colon followed by one to five digits. */
    private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");

    private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + ALPHA_CHARS + "]");

    /**
     * Holds the set of current validation options.
     */
    private final Flags options;

    /**
     * The set of schemes that are allowed to be in a URL.
     * Left empty when {@link #ALLOW_ALL_SCHEMES} is set.
     */
    private final Set<String> allowedSchemes = new HashSet<>();

    /**
     * If no schemes are provided, default to this set.
     */
    protected String[] defaultSchemes = {"http", "https", "ftp"};

    /**
     * Create a UrlValidator with default properties.
     */
    public UrlValidator() {
        this(null);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     *
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final int options) {
        this(null, options);
    }

    /**
     * Behavior of validation is modified by passing in several strings options:
     *
     * @param schemes Pass in one or more URL schemes to consider valid, passing in
     *        a null will default to "http,https,ftp" being valid.
     *        If a non-null schemes is specified then all valid schemes must
     *        be specified. Setting the ALLOW_ALL_SCHEMES option will
     *        ignore the contents of schemes.
     */
    public UrlValidator(final String[] schemes) {
        this(schemes, 0);
    }

    /**
     * Behaviour of validation is modified by passing in options:
     *
     * @param schemes The set of valid schemes. A {@code null} value selects
     * {@link #defaultSchemes}. Ignored when ALLOW_ALL_SCHEMES is set.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final String[] schemes, final int options) {
        this.options = new Flags(options);

        // When every scheme is allowed the explicit scheme set is never consulted.
        if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
            return;
        }

        final String[] effectiveSchemes = schemes == null ? defaultSchemes : schemes;
        allowedSchemes.addAll(Arrays.asList(effectiveSchemes));
    }

    /**
     * Returns the number of times the token appears in the target.
     * Overlapping occurrences are counted individually, so the token
     * {@code "//"} is found twice in {@code "///"}.
     *
     * @param token Token value to be counted. An empty token is never
     * considered to occur, so {@code 0} is returned for it.
     * @param target Target value to count tokens in.
     * @return the number of tokens.
     * @throws NullPointerException if {@code token} or {@code target} is {@code null}.
     */
    protected int countToken(final String token, final String target) {
        // String.indexOf("", from) never returns -1, so without this guard the
        // search loop below would never terminate for an empty token.
        if (token.isEmpty()) {
            return 0;
        }

        int count = 0;
        int tokenIndex = target.indexOf(token);
        while (tokenIndex != -1) {
            count++;
            // Advance by a single character so that overlapping matches are counted.
            tokenIndex = target.indexOf(token, tokenIndex + 1);
        }
        return count;
    }

    /**
     * <p>Checks if a field has a valid URL address.</p>
     *
     * <p>The value must consist of one or more ASCII characters. It is split into
     * scheme, authority, path, query and fragment, and each part is checked in
     * that order by the corresponding {@code isValidXxx} method; validation stops
     * at the first part that is rejected.</p>
     *
     * @param value The value validation is being performed on.  A {@code null}
     * value is considered invalid.
     * @return true if the URL is valid.
     */
    public boolean isValid(final String value) {
        if (value == null || !LEGAL_ASCII_PATTERN.matcher(value).matches()) {
            return false;
        }

        // Check the whole url address structure
        final Matcher urlMatcher = URL_PATTERN.matcher(value);
        if (!urlMatcher.matches()) {
            return false;
        }

        return isValidScheme(urlMatcher.group(PARSE_URL_SCHEME))
                && isValidAuthority(urlMatcher.group(PARSE_URL_AUTHORITY))
                && isValidPath(urlMatcher.group(PARSE_URL_PATH))
                && isValidQuery(urlMatcher.group(PARSE_URL_QUERY))
                && isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT));
    }

    /**
     * Returns true if the authority is properly formatted.  An authority is the combination
     * of hostname and port.  A {@code null} authority value is considered invalid.
     *
     * <p>The host part must either be accepted by {@link InetAddressValidator} or be a
     * dotted host name with at least two labels whose last label is
     * {@value #TOP_LEVEL_MIN_LEN} to {@value #TOP_LEVEL_MAX_LEN} characters long and
     * starts with an ASCII letter. A port, if present, must be a colon followed by
     * one to five digits, and nothing else may follow.</p>
     *
     * @param authority Authority value to validate.
     * @return true if authority (hostname and port) is valid.
     */
    protected boolean isValidAuthority(final String authority) {
        if (authority == null) {
            return false;
        }

        final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
        if (!authorityMatcher.matches()) {
            return false;
        }

        // check if authority is IP address or hostname
        final String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
        final boolean ipAddress = InetAddressValidator.getInstance().isValid(hostIP);
        final boolean hostname = !ipAddress && DOMAIN_PATTERN.matcher(hostIP).matches();

        if (hostname) {
            // DOMAIN_PATTERN guarantees non-empty labels separated by single dots,
            // so the top-level label is simply whatever follows the last dot.
            final int lastDot = hostIP.lastIndexOf('.');
            final String topLevel = hostIP.substring(lastDot + 1);

            // The top-level label must have an acceptable length and start with a
            // letter (the rightmost label will never start with a digit), and at
            // least one further label must precede it.
            if (topLevel.length() < TOP_LEVEL_MIN_LEN
                    || topLevel.length() > TOP_LEVEL_MAX_LEN
                    || !ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()
                    || lastDot < 0) {
                return false;
            }
        }

        if (!hostname && !ipAddress) {
            return false;
        }

        final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
        if (port != null && !PORT_PATTERN.matcher(port).matches()) {
            return false;
        }

        // Anything left over after host and port makes the authority invalid.
        final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
        return GenericValidator.isBlankOrNull(extra);
    }

    /**
     * Returns true if the given fragment is null or fragments are allowed.
     *
     * @param fragment Fragment value to validate.
     * @return true if fragment is valid.
     */
    protected boolean isValidFragment(final String fragment) {
        if (fragment == null) {
            return true;
        }

        return options.isOff(NO_FRAGMENTS);
    }

    /**
     * Returns true if the path is valid.  A {@code null} value is considered invalid.
     *
     * <p>Besides matching the allowed path characters, the path must not contain
     * {@code "//"} unless ALLOW_2_SLASHES is set, and must not contain so many
     * {@code ".."} sequences that they are not outnumbered by the remaining
     * slashes.</p>
     *
     * @param path Path value to validate.
     * @return true if path is valid.
     */
    protected boolean isValidPath(final String path) {
        if (path == null || !PATH_PATTERN.matcher(path).matches()) {
            return false;
        }

        final int slash2Count = countToken("//", path);
        if (options.isOff(ALLOW_2_SLASHES) && slash2Count > 0) {
            return false;
        }

        final int slashCount = countToken("/", path);
        final int dot2Count = countToken("..", path);
        // Reject the path when the ".." sequences are at least as numerous as the
        // slashes left after discounting double slashes and the leading slash.
        return dot2Count <= 0 || slashCount - slash2Count - 1 > dot2Count;
    }

    /**
     * Returns true if the query is null, or it's a properly formatted query string.
     *
     * @param query Query value to validate.
     * @return true if query is valid.
     */
    protected boolean isValidQuery(final String query) {
        if (query == null) {
            return true;
        }

        return QUERY_PATTERN.matcher(query).matches();
    }

    /**
     * Validate scheme. The scheme must be syntactically well formed and, unless
     * ALLOW_ALL_SCHEMES is set, must be one of the schemes this validator was
     * constructed with (or one of the default schemes if none were supplied).
     *
     * @param scheme The scheme to validate.  A {@code null} value is considered
     * invalid.
     * @return true if valid.
     */
    protected boolean isValidScheme(final String scheme) {
        if (scheme == null || !SCHEME_PATTERN.matcher(scheme).matches()) {
            return false;
        }

        return options.isOn(ALLOW_ALL_SCHEMES) || allowedSchemes.contains(scheme);
    }
}