/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
17  package org.apache.river.api.net;
18  
19  import java.io.File;
20  import java.net.URISyntaxException;
21  import java.util.StringTokenizer;
22  import org.apache.river.impl.Messages;
23  
/**
 * Parser that splits an RFC 3986 compliant URI string into its components
 * (scheme, authority, path, query and fragment) and validates each of them.
 * @since 3.0.0
 */
28  final class UriParser {
29      
    /** True when the platform file separator character is a backslash. */
    private static final boolean isFileCaseInsensitiveOS = File.separatorChar == '\\';

    // Parse results. All of these are populated by parseURI / parseAuthority.

    /** The input URI; for hierarchical URIs parseURI replaces it with the rebuilt, path-normalized form. */
    String string;
    /** Scheme as returned by Uri.toAsciiLowerCase, or null when the URI has no scheme. */
    String scheme;
    /** Everything between the scheme delimiter and the fragment; rebuilt with the normalized path for hierarchical URIs. */
    String schemespecificpart;
    /** Text between the leading "//" and the path; null when absent or empty. */
    String authority;
    /** User information; only set when the authority parses as server-based. */
    String userinfo;
    /** Host; only set when the authority parses as server-based. */
    String host;
    /** Port, or -1 when none was given or the authority is not server-based. */
    int port = -1;
    /** Path exactly as found in the input (not normalized); left null for opaque URIs. */
    String path;
    /** Text after the first '?' of the scheme specific part; may be empty, null when there is no '?'. */
    String query;
    /** Text after the first '#'; null when there is no '#'. */
    String fragment;
    /** True when the URI has a scheme and its scheme specific part does not start with '/'. */
    boolean opaque;
    /** True when a scheme was found. */
    boolean absolute;
    /** Set to true by parseAuthority once userinfo, host and port have been filled in. */
    boolean serverAuthority = false;
    // NOTE(review): never read or written inside this class; presumably a hash cache used by the owning Uri - confirm.
    int hash = -1;
    /** True when the scheme is "file" and isFileCaseInsensitiveOS is true. */
    boolean fileSchemeCaseInsensitiveOS;
47  
    /**
     * Parses {@code uri} and stores the resulting components in the fields of
     * this parser.
     *
     * The fragment is split off first, then the scheme, then the scheme
     * specific part. A URI without a scheme, or whose scheme specific part
     * starts with '/', is treated as hierarchical and is further split into
     * query, authority and path; any other URI is opaque and only its scheme
     * specific part is validated. For hierarchical URIs {@link #string} and
     * {@link #schemespecificpart} are rebuilt using the normalized path, while
     * the {@link #path} field keeps the path as it appeared in the input.
     *
     * @param uri the URI text to parse
     * @param forceServer passed unchanged to {@link #parseAuthority(boolean)}
     * @throws URISyntaxException if a component fails validation, the scheme
     *         is empty, the scheme specific part of an absolute URI is empty,
     *         or "//" is followed by no authority, query or fragment
     */
    void parseURI(String uri, boolean forceServer) throws URISyntaxException {
        char fSlash = '/';
        boolean fileURI;
        StringBuilder temp = new StringBuilder(uri);
        // assign uri string to the input value per spec
        string = uri;
        int index;
        int index1;
        int index2;
        int index3;
        // parse into Fragment, Scheme, and SchemeSpecificPart
        // then parse SchemeSpecificPart if necessary
        // Fragment
        index = temp.indexOf("#");
        if (index != -1) {
            // remove the fragment from the end
            fragment = temp.substring(index + 1);
            validateFragment(uri, fragment, index + 1);
            //                temp = temp.substring(0, index);
            temp.delete(index, temp.length());
        }
        // Scheme and SchemeSpecificPart
        // index1 keeps the position of the first ':' (or -1) for later offset
        // calculations, because index itself is reused below.
        index = index1 = temp.indexOf(":");
        index2 = temp.indexOf("/");
        index3 = temp.indexOf("?");
        // if a '/' or '?' occurs before the first ':' the uri has no
        // specified scheme, and is therefore not absolute
        if (index != -1 && (index2 >= index || index2 == -1) && (index3 >= index || index3 == -1)) {
            // the characters up to the first ':' comprise the scheme
            absolute = true;
            scheme = Uri.toAsciiLowerCase(temp.substring(0, index));
            if (scheme.length() == 0) {
                throw new URISyntaxException(uri, Messages.getString("luni.83"), index);
            }
            validateScheme(uri, scheme, 0);
            fileURI = (scheme.equalsIgnoreCase("file"));
            fileSchemeCaseInsensitiveOS = (fileURI && isFileCaseInsensitiveOS);
            schemespecificpart = temp.substring(index + 1);
            if (schemespecificpart.length() == 0) {
                throw new URISyntaxException(uri, Messages.getString("luni.84"), index + 1);
            }
        } else {
            absolute = false;
            schemespecificpart = temp.toString();
        }
        if (scheme == null || schemespecificpart.length() > 0 && schemespecificpart.charAt(0) == fSlash) {
            opaque = false;
            // the URI is hierarchical
            // Query
            // From here on temp holds the scheme specific part only, so the
            // positions found in it are relative to the start of that part.
            temp.delete(0, temp.length());
            temp.append(schemespecificpart);
            index = temp.indexOf("?");
            if (index != -1) {
                query = temp.substring(index + 1);
                temp.delete(index, temp.length());
                // NOTE(review): the error offset is built from index2 (first
                // '/' in the URI) rather than from the start of the scheme
                // specific part; the reported index looks unreliable - confirm.
                validateQuery(uri, query, index2 + 1 + index);
                /**
                 * The following line of code was incorrect and caused 6 test failures.
                 * According to RFC 3986, Pages 40 and 41:
                 * 
                 * For example,
                 * because the "http" scheme makes use of an authority component, has a
                 * default port of "80", and defines an empty path to be equivalent to
                 * "/", the following four URIs are equivalent:
                 * 
                 *    http://example.com
                 *    http://example.com/
                 *    http://example.com:/
                 *    http://example.com:80/
                 * 
                 * Normalization should not remove delimiters when their associated
                 * component is empty unless licensed to do so by the scheme 
                 * specification.
                 * 
                 * For example, the URI "http://example.com/?" cannot be
                 * assumed to be equivalent to any of the examples above.
                 */
//                if ("".equals(query)) query = null; //This line causes ? to be removed.
            }
            // Authority and Path
            if (temp.length() >= 2 && temp.charAt(0) == fSlash && temp.charAt(1) == fSlash) {
                //$NON-NLS-1$
                index = temp.indexOf("/", 2);
                if (index != -1) {
                    authority = temp.substring(2, index);
                    path = temp.substring(index);// begins with "/"
                    //  path        = path-abempty    ; begins with "/" or is empty
                    // split() yields an empty first element for the leading
                    // "/" and drops trailing empty elements.
                    String [] segment = path.split("/");
                    int l = segment.length;
                    // index4 is the running offset of the current segment
                    // within the path
                    int index4 = 1;
                    for (int i = 0; i < l ; i++){
                        validateSegment(uri, segment[i], index1 + index + index4, Uri.segmentLegal);
                        index4 += (segment[i].length() + 1);
                    }
                } else {
                    authority = temp.substring(2);
                    if (authority.length() == 0 && query == null && fragment == null) {
                        throw new URISyntaxException(uri, Messages.getString("luni.9F"), uri.length()); //$NON-NLS-1$
                    }
                    path = ""; //$NON-NLS-1$
                    // nothing left, so path is empty (not null, path should
                    // never be null)
                }
                if (authority.length() == 0) {
                    authority = null;
                }
                // Authority validated by userinfo, host and port, later.
            } else {
                // no authority specified
                String legal;
                int index4 = 0;
                if (scheme == null){
                    // path-noscheme   ; begins with a non-colon segment
                    // path-noscheme = segment-nz-nc *( "/" segment )
                    legal = Uri.segmentNzNcLegal;
                } else {
                    legal = Uri.segmentLegal;
                    // NOTE(review): index still holds the position of '?' in
                    // the scheme specific part (or -1) at this point, not the
                    // offset of the path inside uri; the error index reported
                    // for a bad segment looks unreliable - confirm.
                    index4 = index;
                    // increment index4 if starts with slash.
                    //if (temp.charAt(0) == fSlash) index4 ++;
                }
                path = temp.toString();
                String [] segment = path.split("/");
                int l = segment.length;
                for (int i = 0; i < l ; i++){
                    // in case scheme == null only first segment is segment-nz-nc
                    if (i == 1) legal = Uri.segmentLegal;
                    validateSegment(uri, segment[i], index4, legal);
                    index4 += (segment[i].length() + 1);
                }
            }
        } else {
            // if not hierarchical, URI is opaque
            opaque = true;
            // NOTE(review): the offset equals index + 1 (start of the scheme
            // specific part) only when the URI contains no '/'; otherwise
            // index2 skews it - confirm.
            validateSsp(uri, schemespecificpart, index2 + 2 + index);
        }
        // Returns immediately when authority is null.
        parseAuthority(forceServer);
        // Normalise path and replace string.
        if (!opaque){
            StringBuilder result = new StringBuilder();
            // Only string and schemespecificpart receive the normalized path;
            // the path field is left as parsed.
            String normalizedPath = normalize(path);
            if (scheme != null) {
                result.append(scheme);
                result.append(':');
            }

            schemespecificpart = setSchemeSpecificPart(authority, normalizedPath , query);

            if (authority != null) {
                result.append("//"); //$NON-NLS-1$
                result.append(authority);
            }

            // path is always assigned in the hierarchical branch above
            if (path != null) {
                result.append(normalizedPath);
            }

            if (query != null) {
                result.append('?');
                result.append(query);
            }

            if (fragment != null) {
                result.append('#');
                result.append(fragment);
            }

            this.string = result.toString();
        }
    }
218 
219     private void validateScheme(String uri, String scheme, int index) throws URISyntaxException {
220         // first char needs to be an alpha char
221         char ch = scheme.charAt(0);
222         if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))) {
223             throw new URISyntaxException(uri, Messages.getString("luni.85"), 0); //$NON-NLS-1$
224         }
225         try {
226             URIEncoderDecoder.validateSimple(scheme, "+-."); //$NON-NLS-1$
227         } catch (URISyntaxException e) {
228             throw new URISyntaxException(uri, Messages.getString("luni.85"), index + e.getIndex());
229         }
230     }
231 
232     private void validateSsp(String uri, String ssp, int index) throws URISyntaxException {
233         try {
234             URIEncoderDecoder.validate(ssp, Uri.allLegalUnescaped);
235         } catch (URISyntaxException e) {
236             throw new URISyntaxException(uri, Messages.getString("luni.86", e.getReason()), index + e.getIndex());
237         }
238     }
239 
240     private void validateSegment(String uri, String segment, int index, String legal) throws URISyntaxException {
241         try {
242             URIEncoderDecoder.validate(segment, legal); //$NON-NLS-1$
243         } catch (URISyntaxException e) {
244             throw new URISyntaxException(uri, Messages.getString("luni.88", e.getReason()), index + e.getIndex());
245         }
246     }
247     private void validateQuery(String uri, String query, int index) throws URISyntaxException {
248         try {
249             URIEncoderDecoder.validate(query, Uri.queryFragLegal);
250         } catch (URISyntaxException e) {
251             throw new URISyntaxException(uri, Messages.getString("luni.89", e.getReason()), index + e.getIndex());
252         }
253     }
254 
255     private void validateFragment(String uri, String fragment, int index) throws URISyntaxException {
256         try {
257             URIEncoderDecoder.validate(fragment, Uri.queryFragLegal);
258         } catch (URISyntaxException e) {
259             throw new URISyntaxException(uri, Messages.getString("luni.8A", e.getReason()), index + e.getIndex());
260         }
261     }
262 
263     /**
264      * determine the host, port and userinfo if the authority parses
265      * successfully to a server based authority
266      *
267      * behaviour in error cases: if forceServer is true, throw
268      * URISyntaxException with the proper diagnostic messages. if
269      * forceServer is false assume this is a registry based uri, and just
270      * return leaving the host, port and userinfo fields undefined.
271      *
272      * and there are some error cases where URISyntaxException is thrown
273      * regardless of the forceServer parameter e.g. malformed ipv6 address
274      */
275     void parseAuthority(boolean forceServer) throws URISyntaxException {
276         if (authority == null) {
277             return;
278         }
279         String temp;
280         String tempUserinfo = null;
281         String tempHost;
282         int index;
283         int hostindex = 0;
284         int tempPort = -1;
285         temp = authority;
286         index = temp.indexOf('@');
287         if (index != -1) {
288             // remove user info
289             tempUserinfo = temp.substring(0, index);
290             validateUserinfo(authority, tempUserinfo, 0);
291             temp = temp.substring(index + 1); // host[:port] is left
292             hostindex = index + 1;
293         }
294         index = temp.lastIndexOf(':');
295         int endindex = temp.indexOf(']');
296         if (index != -1 && endindex < index) {
297             // determine port and host
298             tempHost = temp.substring(0, index);
299             if (index < (temp.length() - 1)) {
300                 // port part is not empty
301                 try {
302                     tempPort = Integer.parseInt(temp.substring(index + 1));
303                     if (tempPort < 0) {
304                         if (forceServer) {
305                             throw new URISyntaxException(authority, Messages.getString("luni.8B"), hostindex + index + 1); //$NON-NLS-1$
306                         }
307                         return;
308                     }
309                 } catch (NumberFormatException e) {
310                     if (forceServer) {
311                         throw new URISyntaxException(authority, Messages.getString("luni.8B"), hostindex + index + 1); //$NON-NLS-1$
312                     }
313                     return;
314                 }
315             }
316         } else {
317             tempHost = temp;
318         }
319         if (tempHost.equals("")) {
320             //$NON-NLS-1$
321             if (forceServer) {
322                 throw new URISyntaxException(authority, Messages.getString("luni.A0"), hostindex); //$NON-NLS-1$
323             }
324             return;
325         }
326         if (!isValidHost(forceServer, tempHost)) {
327             return;
328         }
329         // this is a server based uri,
330         // fill in the userinfo, host and port fields
331         userinfo = tempUserinfo;
332         host = tempHost;
333         port = tempPort;
334         serverAuthority = true;
335     }
336 
337     private void validateUserinfo(String uri, String userinfo, int index) throws URISyntaxException {
338         try {
339             URIEncoderDecoder.validate(userinfo, Uri.userinfoLegal); //$NON-NLS-1$
340         } catch (URISyntaxException e) {
341             throw new URISyntaxException(uri, Messages.getString("luni.8C", e.getReason()), index + e.getIndex());
342         }
343     }
344 
345     /**
346      * distinguish between IPv4, IPv6, domain name and validate it based on
347      * its type
348      */
349     private boolean isValidHost(boolean forceServer, String host) throws URISyntaxException {
350         if (host.charAt(0) == '[') {
351             // ipv6 address or IPvFuture
352             if (host.charAt(host.length() - 1) != ']') {
353                 throw new URISyntaxException(host, Messages.getString("luni.8D"), 0); //$NON-NLS-1$
354             }
355             // check for valid IPvFuture syntax.
356             if (isValidIPvFutureAddress(host)) return true;
357             if (!isValidIP6Address(host)) {
358                 throw new URISyntaxException(host, Messages.getString("luni.8E")); //$NON-NLS-1$
359             }
360             return true;
361         }
362         // '[' and ']' can only be the first char and last char
363         // of the host name
364         if (host.indexOf('[') != -1 || host.indexOf(']') != -1) {
365             throw new URISyntaxException(host, Messages.getString("luni.8F"), 0); //$NON-NLS-1$
366         }
367         int index = host.lastIndexOf('.');
368         if (index < 0 || index == host.length() - 1 || !Character.isDigit(host.charAt(index + 1))) {
369             // domain name
370             if (isValidDomainName(host)) {
371                 return true;
372             }
373             if (forceServer) {
374                 throw new URISyntaxException(host, Messages.getString("luni.8F"), 0); //$NON-NLS-1$
375             }
376             return false;
377         }
378         // IPv4 address
379         if (isValidIPv4Address(host)) {
380             return true;
381         }
382         if (forceServer) {
383             throw new URISyntaxException(host, Messages.getString("luni.90"), 0); //$NON-NLS-1$
384         }
385         return false;
386     }
387 
388     private boolean isValidDomainName(String host) throws URISyntaxException {
389         URIEncoderDecoder.validate(host, Uri.hostRegNameLegal); //$NON-NLS-1$
390         String label = null;
391         StringTokenizer st = new StringTokenizer(host, "."); //$NON-NLS-1$
392         while (st.hasMoreTokens()) {
393             label = st.nextToken();
394             if (label.startsWith("-") || label.endsWith("-")) {
395                 //$NON-NLS-1$ //$NON-NLS-2$
396                 return false;
397             }
398         }
399         if ( label != null && !label.equals(host)) {
400             char ch = label.charAt(0);
401             if (ch >= '0' && ch <= '9') {
402                 return false;
403             }
404         }
405         return true;
406     }
407 
408     private boolean isValidIPv4Address(String host) {
409         int index;
410         int index2;
411         try {
412             int num;
413             index = host.indexOf('.');
414             num = Integer.parseInt(host.substring(0, index));
415             if (num < 0 || num > 255) {
416                 return false;
417             }
418             index2 = host.indexOf('.', index + 1);
419             num = Integer.parseInt(host.substring(index + 1, index2));
420             if (num < 0 || num > 255) {
421                 return false;
422             }
423             index = host.indexOf('.', index2 + 1);
424             num = Integer.parseInt(host.substring(index2 + 1, index));
425             if (num < 0 || num > 255) {
426                 return false;
427             }
428             num = Integer.parseInt(host.substring(index + 1));
429             if (num < 0 || num > 255) {
430                 return false;
431             }
432         } catch (NumberFormatException e) {
433             return false;
434         }
435         return true;
436     }
437 
438     private boolean isValidIP6Address(String ipAddress) {
439         int length = ipAddress.length();
440         boolean doubleColon = false;
441         int numberOfColons = 0;
442         int numberOfPeriods = 0;
443         String word = ""; //$NON-NLS-1$
444         char c = 0;
445         char prevChar;
446         int offset = 0; // offset for [] ip addresses
447         if (length < 2) {
448             return false;
449         }
450         for (int i = 0; i < length; i++) {
451             prevChar = c;
452             c = ipAddress.charAt(i);
453             switch (c) {
454             // case for an open bracket [x:x:x:...x]
455                 case '[':
456                     if (i != 0) {
457                         return false; // must be first character
458                     }
459                     if (ipAddress.charAt(length - 1) != ']') {
460                         return false; // must have a close ]
461                     }
462                     if ((ipAddress.charAt(1) == ':') && (ipAddress.charAt(2) != ':')) {
463                         return false;
464                     }
465                     offset = 1;
466                     if (length < 4) {
467                         return false;
468                     }
469                     break;
470             // case for a closed bracket at end of IP [x:x:x:...x]
471                 case ']':
472                     if (i != length - 1) {
473                         return false; // must be last character
474                     }
475                     if (ipAddress.charAt(0) != '[') {
476                         return false; // must have a open [
477                     }
478                     break;
479             // case for the last 32-bits represented as IPv4
480             // x:x:x:x:x:x:d.d.d.d
481                 case '.':
482                     numberOfPeriods++;
483                     if (numberOfPeriods > 3) {
484                         return false;
485                     }
486                     if (!isValidIP4Word(word)) {
487                         return false;
488                     }
489                     if (numberOfColons != 6 && !doubleColon) {
490                         return false;
491                     }
492                     // a special case ::1:2:3:4:5:d.d.d.d allows 7 colons
493                     // with
494                     // an IPv4 ending, otherwise 7 :'s is bad
495                     if (numberOfColons == 7 && ipAddress.charAt(0 + offset) != ':' && ipAddress.charAt(1 + offset) != ':') {
496                         return false;
497                     }
498                     word = ""; //$NON-NLS-1$
499                     break;
500                 case ':':
501                     numberOfColons++;
502                     if (numberOfColons > 7) {
503                         return false;
504                     }
505                     if (numberOfPeriods > 0) {
506                         return false;
507                     }
508                     if (prevChar == ':') {
509                         if (doubleColon) {
510                             return false;
511                         }
512                         doubleColon = true;
513                     }
514                     word = ""; //$NON-NLS-1$
515                     break;
516                 default:
517                     if (word.length() > 3) {
518                         return false;
519                     }
520                     if (!isValidHexChar(c)) {
521                         return false;
522                     }
523                     word += c;
524             }
525         }
526         // Check if we have an IPv4 ending
527         if (numberOfPeriods > 0) {
528             if (numberOfPeriods != 3 || !isValidIP4Word(word)) {
529                 return false;
530             }
531         } else {
532             // If we're at then end and we haven't had 7 colons then there
533             // is a problem unless we encountered a doubleColon
534             if (numberOfColons != 7 && !doubleColon) {
535                 return false;
536             }
537             // If we have an empty word at the end, it means we ended in
538             // either a : or a .
539             // If we did not end in :: then this is invalid
540             if (word.equals("") && ipAddress.charAt(length - 1 - offset) != ':' && ipAddress.charAt(length - 2 - offset) != ':') {
541                 return false;
542             }
543         }
544         return true;
545     }
546 
547     private boolean isValidIP4Word(String word) {
548         char c;
549         if (word.length() < 1 || word.length() > 3) return false;
550         for (int i = 0; i < word.length(); i++) {
551             c = word.charAt(i);
552             if (!(c >= '0' && c <= '9')) return false;
553         }
554         return (Integer.parseInt(word) <= 255);
555     }
556 
557     private boolean isValidHexChar(char c) {
558         return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
559     }
560 
561     private boolean isValidIPvFutureAddress(String ipvFuture) throws URISyntaxException {
562         // [ at index 0 has been checked.
563         if (ipvFuture.charAt(1) != 'v') return false;
564         if (!isValidHexChar(ipvFuture.charAt(2))) return false;
565         if (ipvFuture.charAt(3) != '.') return false;
566         String sub = ipvFuture.substring(4, ipvFuture.length()-1);
567         URIEncoderDecoder.validate(sub, Uri.iPvFuture);
568         return true;
569     }
570     
571     /*
572      * normalize path, and return the resulting string
573      */
574     private String normalize(String path) {
575         // count the number of '/'s, to determine number of segments
576         int index = -1;
577         int pathlen = path.length();
578         int size = 0;
579         if (pathlen > 0 && path.charAt(0) != '/') {
580             size++;
581         }
582         while ((index = path.indexOf('/', index + 1)) != -1) {
583             if (index + 1 < pathlen && path.charAt(index + 1) != '/') {
584                 size++;
585             }
586         }
587 
588         String[] seglist = new String[size];
589         boolean[] include = new boolean[size];
590 
591         // break the path into segments and store in the list
592         int current = 0;
593         int index2;
594         index = (pathlen > 0 && path.charAt(0) == '/') ? 1 : 0;
595         while ((index2 = path.indexOf('/', index + 1)) != -1) {
596             seglist[current++] = path.substring(index, index2);
597             index = index2 + 1;
598         }
599 
600         // if current==size, then the last character was a slash
601         // and there are no more segments
602         if (current < size) {
603             seglist[current] = path.substring(index);
604         }
605 
606         // determine which segments get included in the normalized path
607         for (int i = 0; i < size; i++) {
608             include[i] = true;
609             if (seglist[i].equals("..")) { //$NON-NLS-1$
610                 int remove = i - 1;
611                 // search back to find a segment to remove, if possible
612                 while (remove > -1 && !include[remove]) {
613                     remove--;
614                 }
615                 // if we find a segment to remove, remove it and the ".."
616                 // segment
617                 if (remove > -1 && !seglist[remove].equals("..")) { //$NON-NLS-1$
618                     include[remove] = false;
619                     include[i] = false;
620                 }
621             } else if (seglist[i].equals(".")) { //$NON-NLS-1$
622                 include[i] = false;
623             }
624         }
625 
626         // put the path back together
627         StringBuilder newpath = new StringBuilder();
628         if (path.startsWith("/")) { //$NON-NLS-1$
629             newpath.append('/');
630         }
631 
632         for (int i = 0; i < seglist.length; i++) {
633             if (include[i]) {
634                 newpath.append(seglist[i]);
635                 newpath.append('/');
636             }
637         }
638 
639         // if we used at least one segment and the path previously ended with
640         // a slash and the last segment is still used, then delete the extra
641         // trailing '/'
642         if (!path.endsWith("/") && seglist.length > 0 //$NON-NLS-1$
643                 && include[seglist.length - 1]) {
644             newpath.deleteCharAt(newpath.length() - 1);
645         }
646 
647         String result = newpath.toString();
648 
649         // check for a ':' in the first segment if one exists,
650         // prepend "./" to normalize
651         index = result.indexOf(':');
652         index2 = result.indexOf('/');
653         if (index != -1 && (index < index2 || index2 == -1)) {
654             newpath.insert(0, "./"); //$NON-NLS-1$
655             result = newpath.toString();
656         }
657         return result;
658     }
659     
660     /**
661      * UriParser method used to re-calculate the scheme specific part of the
662      * resolved or normalized URIs
663      */
664     private String setSchemeSpecificPart(String authority,
665                                          String path,
666                                          String query) {
667         // ssp = [//authority][path][?query]
668         StringBuilder ssp = new StringBuilder();
669         if (authority != null) {
670             ssp.append("//"); //$NON-NLS-1$
671             ssp.append(authority);
672         }
673         if (path != null) {
674             ssp.append(path);
675         }
676         if (query != null) {
677             ssp.append("?"); //$NON-NLS-1$
678             ssp.append(query);
679         }
680         // reset string, so that it can be re-calculated correctly when asked.
681         return ssp.toString();
682     }
683 }