ReactOS  0.4.13-dev-563-g0561610
uri.c
Go to the documentation of this file.
1 /*
2  * Copyright 2010 Jacek Caban for CodeWeavers
3  * Copyright 2010 Thomas Mullaly
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2.1 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this library; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
18  */
19 
20 #include <limits.h>
21 
22 #include "urlmon_main.h"
23 #include "wine/debug.h"
24 
25 #define NO_SHLWAPI_REG
26 #include "shlwapi.h"
27 
28 #include "strsafe.h"
29 
30 #define URI_DISPLAY_NO_ABSOLUTE_URI 0x1
31 #define URI_DISPLAY_NO_DEFAULT_PORT_AUTH 0x2
32 
33 #define ALLOW_NULL_TERM_SCHEME 0x01
34 #define ALLOW_NULL_TERM_USER_NAME 0x02
35 #define ALLOW_NULL_TERM_PASSWORD 0x04
36 #define ALLOW_BRACKETLESS_IP_LITERAL 0x08
37 #define SKIP_IP_FUTURE_CHECK 0x10
38 #define IGNORE_PORT_DELIMITER 0x20
39 
40 #define RAW_URI_FORCE_PORT_DISP 0x1
41 #define RAW_URI_CONVERT_TO_DOS_PATH 0x2
42 
43 #define COMBINE_URI_FORCE_FLAG_USE 0x1
44 
46 
47 static const IID IID_IUriObj = {0x4b364760,0x9f51,0x11df,{0x98,0x1c,0x08,0x00,0x20,0x0c,0x9a,0x66}};
48 
49 typedef struct {
54 
56 
58 
59  /* Information about the canonicalized URI's buffer. */
65 
69 
73 
76  Uri_HOST_TYPE host_type;
77 
81 
84 
86 
90 
93 
96 } Uri;
97 
98 typedef struct {
101 
104 
107 
110 
113 
116 
119 
122 
125 
128 } UriBuilder;
129 
130 typedef struct {
131  const WCHAR *str;
133 } h16;
134 
135 typedef struct {
136  /* IPv6 addresses can hold up to 8 h16 components. */
139 
140  /* An IPv6 can have 1 elision ("::"). */
141  const WCHAR *elision;
142 
143  /* An IPv6 can contain 1 IPv4 address as the last 32bits of the address. */
144  const WCHAR *ipv4;
146 
149 } ipv6_address;
150 
151 typedef struct {
153 
160 
161  const WCHAR *scheme;
164 
165  const WCHAR *username;
167 
168  const WCHAR *password;
170 
171  const WCHAR *host;
173  Uri_HOST_TYPE host_type;
174 
177 
179  const WCHAR *port;
182 
183  const WCHAR *path;
185 
186  const WCHAR *query;
188 
189  const WCHAR *fragment;
191 } parse_data;
192 
193 static const CHAR hexDigits[] = "0123456789ABCDEF";
194 
195 /* List of scheme types/scheme names that are recognized by the IUri interface as of IE 7. */
196 static const struct {
199 } recognized_schemes[] = {
200  {URL_SCHEME_FTP, {'f','t','p',0}},
201  {URL_SCHEME_HTTP, {'h','t','t','p',0}},
202  {URL_SCHEME_GOPHER, {'g','o','p','h','e','r',0}},
203  {URL_SCHEME_MAILTO, {'m','a','i','l','t','o',0}},
204  {URL_SCHEME_NEWS, {'n','e','w','s',0}},
205  {URL_SCHEME_NNTP, {'n','n','t','p',0}},
206  {URL_SCHEME_TELNET, {'t','e','l','n','e','t',0}},
207  {URL_SCHEME_WAIS, {'w','a','i','s',0}},
208  {URL_SCHEME_FILE, {'f','i','l','e',0}},
209  {URL_SCHEME_MK, {'m','k',0}},
210  {URL_SCHEME_HTTPS, {'h','t','t','p','s',0}},
211  {URL_SCHEME_SHELL, {'s','h','e','l','l',0}},
212  {URL_SCHEME_SNEWS, {'s','n','e','w','s',0}},
213  {URL_SCHEME_LOCAL, {'l','o','c','a','l',0}},
214  {URL_SCHEME_JAVASCRIPT, {'j','a','v','a','s','c','r','i','p','t',0}},
215  {URL_SCHEME_VBSCRIPT, {'v','b','s','c','r','i','p','t',0}},
216  {URL_SCHEME_ABOUT, {'a','b','o','u','t',0}},
217  {URL_SCHEME_RES, {'r','e','s',0}},
218  {URL_SCHEME_MSSHELLROOTED, {'m','s','-','s','h','e','l','l','-','r','o','o','t','e','d',0}},
219  {URL_SCHEME_MSSHELLIDLIST, {'m','s','-','s','h','e','l','l','-','i','d','l','i','s','t',0}},
220  {URL_SCHEME_MSHELP, {'h','c','p',0}},
221  {URL_SCHEME_WILDCARD, {'*',0}}
222 };
223 
224 /* List of default ports Windows recognizes. */
225 static const struct {
228 } default_ports[] = {
229  {URL_SCHEME_FTP, 21},
230  {URL_SCHEME_HTTP, 80},
231  {URL_SCHEME_GOPHER, 70},
232  {URL_SCHEME_NNTP, 119},
233  {URL_SCHEME_TELNET, 23},
234  {URL_SCHEME_WAIS, 210},
235  {URL_SCHEME_HTTPS, 443},
236 };
237 
238 /* List of 3-character top level domain names Windows seems to recognize.
239  * There might be more, but, these are the only ones I've found so far.
240  */
241 static const struct {
243 } recognized_tlds[] = {
244  {{'c','o','m',0}},
245  {{'e','d','u',0}},
246  {{'g','o','v',0}},
247  {{'i','n','t',0}},
248  {{'m','i','l',0}},
249  {{'n','e','t',0}},
250  {{'o','r','g',0}}
251 };
252 
254 {
255  Uri *ret;
256  HRESULT hres;
257 
258  hres = IUri_QueryInterface(uri, &IID_IUriObj, (void**)&ret);
259  return SUCCEEDED(hres) ? ret : NULL;
260 }
261 
262 static inline BOOL is_alpha(WCHAR val) {
263  return ((val >= 'a' && val <= 'z') || (val >= 'A' && val <= 'Z'));
264 }
265 
266 static inline BOOL is_num(WCHAR val) {
267  return (val >= '0' && val <= '9');
268 }
269 
270 static inline BOOL is_drive_path(const WCHAR *str) {
271  return (is_alpha(str[0]) && (str[1] == ':' || str[1] == '|'));
272 }
273 
274 static inline BOOL is_unc_path(const WCHAR *str) {
275  return (str[0] == '\\' && str[1] == '\\');
276 }
277 
279  return (val == '>' || val == '<' || val == '\"');
280 }
281 
282 /* A URI is implicitly a file path if it begins with
283  * a drive letter (e.g. X:) or starts with "\\" (UNC path).
284  */
285 static inline BOOL is_implicit_file_path(const WCHAR *str) {
286  return (is_unc_path(str) || (is_alpha(str[0]) && str[1] == ':'));
287 }
288 
289 /* Checks if the URI is a hierarchical URI. A hierarchical
290  * URI is one that has "//" after the scheme.
291  */
292 static BOOL check_hierarchical(const WCHAR **ptr) {
293  const WCHAR *start = *ptr;
294 
295  if(**ptr != '/')
296  return FALSE;
297 
298  ++(*ptr);
299  if(**ptr != '/') {
300  *ptr = start;
301  return FALSE;
302  }
303 
304  ++(*ptr);
305  return TRUE;
306 }
307 
308 /* unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" */
309 static inline BOOL is_unreserved(WCHAR val) {
310  return (is_alpha(val) || is_num(val) || val == '-' || val == '.' ||
311  val == '_' || val == '~');
312 }
313 
314 /* sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
315  * / "*" / "+" / "," / ";" / "="
316  */
317 static inline BOOL is_subdelim(WCHAR val) {
318  return (val == '!' || val == '$' || val == '&' ||
319  val == '\'' || val == '(' || val == ')' ||
320  val == '*' || val == '+' || val == ',' ||
321  val == ';' || val == '=');
322 }
323 
324 /* gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" */
325 static inline BOOL is_gendelim(WCHAR val) {
326  return (val == ':' || val == '/' || val == '?' ||
327  val == '#' || val == '[' || val == ']' ||
328  val == '@');
329 }
330 
331 /* Characters that delimit the end of the authority
332  * section of a URI. Sometimes a '\\' is considered
333  * an authority delimiter.
334  */
335 static inline BOOL is_auth_delim(WCHAR val, BOOL acceptSlash) {
336  return (val == '#' || val == '/' || val == '?' ||
337  val == '\0' || (acceptSlash && val == '\\'));
338 }
339 
340 /* reserved = gen-delims / sub-delims */
341 static inline BOOL is_reserved(WCHAR val) {
342  return (is_subdelim(val) || is_gendelim(val));
343 }
344 
345 static inline BOOL is_hexdigit(WCHAR val) {
346  return ((val >= 'a' && val <= 'f') ||
347  (val >= 'A' && val <= 'F') ||
348  (val >= '0' && val <= '9'));
349 }
350 
352  return (!val || (val == '#' && scheme != URL_SCHEME_FILE) || val == '?');
353 }
354 
355 static inline BOOL is_slash(WCHAR c)
356 {
357  return c == '/' || c == '\\';
358 }
359 
360 static inline BOOL is_ascii(WCHAR c)
361 {
362  return c < 0x80;
363 }
364 
366  DWORD i;
367 
368  for(i = 0; i < ARRAY_SIZE(default_ports); ++i) {
370  return TRUE;
371  }
372 
373  return FALSE;
374 }
375 
376 /* List of scheme types Windows seems to expect to be hierarchical. */
378  return(type == URL_SCHEME_HTTP || type == URL_SCHEME_FTP ||
382  type == URL_SCHEME_RES);
383 }
384 
385 /* Checks if 'flags' contains an invalid combination of Uri_CREATE flags. */
387  return((flags & Uri_CREATE_DECODE_EXTRA_INFO && flags & Uri_CREATE_NO_DECODE_EXTRA_INFO) ||
388  (flags & Uri_CREATE_CANONICALIZE && flags & Uri_CREATE_NO_CANONICALIZE) ||
389  (flags & Uri_CREATE_CRACK_UNKNOWN_SCHEMES && flags & Uri_CREATE_NO_CRACK_UNKNOWN_SCHEMES) ||
390  (flags & Uri_CREATE_PRE_PROCESS_HTML_URI && flags & Uri_CREATE_NO_PRE_PROCESS_HTML_URI) ||
391  (flags & Uri_CREATE_IE_SETTINGS && flags & Uri_CREATE_NO_IE_SETTINGS));
392 }
393 
394 /* Applies each default Uri_CREATE flags to 'flags' if it
395  * doesn't cause a flag conflict.
396  */
398  if(!(*flags & Uri_CREATE_NO_CANONICALIZE))
399  *flags |= Uri_CREATE_CANONICALIZE;
400  if(!(*flags & Uri_CREATE_NO_DECODE_EXTRA_INFO))
401  *flags |= Uri_CREATE_DECODE_EXTRA_INFO;
402  if(!(*flags & Uri_CREATE_NO_CRACK_UNKNOWN_SCHEMES))
403  *flags |= Uri_CREATE_CRACK_UNKNOWN_SCHEMES;
404  if(!(*flags & Uri_CREATE_NO_PRE_PROCESS_HTML_URI))
405  *flags |= Uri_CREATE_PRE_PROCESS_HTML_URI;
406  if(!(*flags & Uri_CREATE_IE_SETTINGS))
407  *flags |= Uri_CREATE_NO_IE_SETTINGS;
408 }
409 
410 /* Determines if the URI is hierarchical using the information already parsed into
411  * data and using the current location of parsing in the URI string.
412  *
413  * Windows considers a URI hierarchical if one of the following is true:
414  * A.) It's a wildcard scheme.
415  * B.) It's an implicit file scheme.
416  * C.) It's a known hierarchical scheme and it has two '\\' after the scheme name.
417  * (the '\\' will be converted into "//" during canonicalization).
418  * D.) "//" appears after the scheme name (or at the beginning if no scheme is given).
419  */
420 static inline BOOL is_hierarchical_uri(const WCHAR **ptr, const parse_data *data) {
421  const WCHAR *start = *ptr;
422 
423  if(data->scheme_type == URL_SCHEME_WILDCARD)
424  return TRUE;
425  else if(data->scheme_type == URL_SCHEME_FILE && data->has_implicit_scheme)
426  return TRUE;
427  else if(is_hierarchical_scheme(data->scheme_type) && (*ptr)[0] == '\\' && (*ptr)[1] == '\\') {
428  *ptr += 2;
429  return TRUE;
430  } else if(data->scheme_type != URL_SCHEME_MAILTO && check_hierarchical(ptr))
431  return TRUE;
432 
433  *ptr = start;
434  return FALSE;
435 }
436 
437 /* Computes the size of the given IPv6 address.
438  * Each h16 component is 16 bits. If there is an IPv4 address, it's
439  * 32 bits. If there's an elision it can be 16 to 128 bits, depending
440  * on the number of other components.
441  *
442  * Modeled after google-url's CheckIPv6ComponentsSize function
443  */
445  address->components_size = address->h16_count * 2;
446 
447  if(address->ipv4)
448  /* IPv4 address is 4 bytes. */
449  address->components_size += 4;
450 
451  if(address->elision) {
452  /* An elision can be anywhere from 2 bytes up to 16 bytes.
453  * Its size depends on the size of the h16 and IPv4 components.
454  */
455  address->elision_size = 16 - address->components_size;
456  if(address->elision_size < 2)
457  address->elision_size = 2;
458  } else
459  address->elision_size = 0;
460 }
461 
462 /* Taken from dlls/jscript/lex.c */
463 static int hex_to_int(WCHAR val) {
464  if(val >= '0' && val <= '9')
465  return val - '0';
466  else if(val >= 'a' && val <= 'f')
467  return val - 'a' + 10;
468  else if(val >= 'A' && val <= 'F')
469  return val - 'A' + 10;
470 
471  return -1;
472 }
473 
474 /* Helper function for converting a percent encoded string
475  * representation of a WCHAR value into its actual WCHAR value. If
476  * the two characters following the '%' aren't valid hex values then
477  * this function returns the NULL character.
478  *
479  * E.g.
480  * "%2E" will result in '.' being returned by this function.
481  */
482 static WCHAR decode_pct_val(const WCHAR *ptr) {
483  WCHAR ret = '\0';
484 
485  if(*ptr == '%' && is_hexdigit(*(ptr + 1)) && is_hexdigit(*(ptr + 2))) {
486  INT a = hex_to_int(*(ptr + 1));
487  INT b = hex_to_int(*(ptr + 2));
488 
489  ret = a << 4;
490  ret += b;
491  }
492 
493  return ret;
494 }
495 
496 /* Helper function for percent encoding a given character
497  * and storing the encoded value into a given buffer (dest).
498  *
499  * It's up to the calling function to ensure that there is
500  * at least enough space in 'dest' for the percent encoded
501  * value to be stored (so dest + 3 spaces available).
502  */
503 static inline void pct_encode_val(WCHAR val, WCHAR *dest) {
504  dest[0] = '%';
505  dest[1] = hexDigits[(val >> 4) & 0xf];
506  dest[2] = hexDigits[val & 0xf];
507 }
508 
509 /* Attempts to parse the domain name from the host.
510  *
511  * This function also includes the Top-level Domain (TLD) name
512  * of the host when it tries to find the domain name. If it finds
513  * a valid domain name it will assign 'domain_start' the offset
514  * into 'host' where the domain name starts.
515  *
516  * It's implied that if there is a domain name its range is:
517  * [host+domain_start, host+host_len).
518  */
519 void find_domain_name(const WCHAR *host, DWORD host_len,
520  INT *domain_start) {
521  const WCHAR *last_tld, *sec_last_tld, *end;
522 
523  end = host+host_len-1;
524 
525  *domain_start = -1;
526 
527  /* There has to be at least enough room for a '.' followed by a
528  * 3-character TLD for a domain to even exist in the host name.
529  */
530  if(host_len < 4)
531  return;
532 
533  last_tld = memrchrW(host, '.', host_len);
534  if(!last_tld)
535  /* http://hostname -> has no domain name. */
536  return;
537 
538  sec_last_tld = memrchrW(host, '.', last_tld-host);
539  if(!sec_last_tld) {
540  /* If the '.' is at the beginning of the host there
541  * has to be at least 3 characters in the TLD for it
542  * to be valid.
543  * Ex: .com -> .com as the domain name.
544  * .co -> has no domain name.
545  */
546  if(last_tld-host == 0) {
547  if(end-(last_tld-1) < 3)
548  return;
549  } else if(last_tld-host == 3) {
550  DWORD i;
551 
552  /* If there are three characters in front of last_tld and
553  * they are on the list of recognized TLDs, then this
554  * host doesn't have a domain (since the host only contains
555  * a TLD name).
556  * Ex: edu.uk -> has no domain name.
557  * foo.uk -> foo.uk as the domain name.
558  */
559  for(i = 0; i < ARRAY_SIZE(recognized_tlds); ++i) {
561  return;
562  }
563  } else if(last_tld-host < 3)
564  /* Anything less than 3 characters is considered part
565  * of the TLD name.
566  * Ex: ak.uk -> Has no domain name.
567  */
568  return;
569 
570  /* Otherwise the domain name is the whole host name. */
571  *domain_start = 0;
572  } else if(end+1-last_tld > 3) {
573  /* If the last_tld has more than 3 characters, then it's automatically
574  * considered the TLD of the domain name.
575  * Ex: www.winehq.org.uk.test -> uk.test as the domain name.
576  */
577  *domain_start = (sec_last_tld+1)-host;
578  } else if(last_tld - (sec_last_tld+1) < 4) {
579  DWORD i;
580  /* If the sec_last_tld is 3 characters long it HAS to be on the list of
581  * recognized TLDs to still be considered part of the TLD name, otherwise
582  * it's considered the domain name.
583  * Ex: www.google.com.uk -> google.com.uk as the domain name.
584  * www.google.foo.uk -> foo.uk as the domain name.
585  */
586  if(last_tld - (sec_last_tld+1) == 3) {
587  for(i = 0; i < ARRAY_SIZE(recognized_tlds); ++i) {
588  if(!StrCmpNIW(sec_last_tld+1, recognized_tlds[i].tld_name, 3)) {
589  const WCHAR *domain = memrchrW(host, '.', sec_last_tld-host);
590 
591  if(!domain)
592  *domain_start = 0;
593  else
594  *domain_start = (domain+1) - host;
595  TRACE("Found domain name %s\n", debugstr_wn(host+*domain_start,
596  (host+host_len)-(host+*domain_start)));
597  return;
598  }
599  }
600 
601  *domain_start = (sec_last_tld+1)-host;
602  } else {
603  /* Since the sec_last_tld is less than 3 characters it's considered
604  * part of the TLD.
605  * Ex: www.google.fo.uk -> google.fo.uk as the domain name.
606  */
607  const WCHAR *domain = memrchrW(host, '.', sec_last_tld-host);
608 
609  if(!domain)
610  *domain_start = 0;
611  else
612  *domain_start = (domain+1) - host;
613  }
614  } else {
615  /* The second to last TLD has more than 3 characters making it
616  * the domain name.
617  * Ex: www.google.test.us -> test.us as the domain name.
618  */
619  *domain_start = (sec_last_tld+1)-host;
620  }
621 
622  TRACE("Found domain name %s\n", debugstr_wn(host+*domain_start,
623  (host+host_len)-(host+*domain_start)));
624 }
625 
626 /* Removes the dot segments from a hierarchical URIs path component. This
627  * function performs the removal in place.
628  *
629  * This function returns the new length of the path string.
630  */
632  WCHAR *out = path;
633  const WCHAR *in = out;
634  const WCHAR *end = out + path_len;
635  DWORD len;
636 
637  while(in < end) {
638  /* Move the first path segment in the input buffer to the end of
639  * the output buffer, and any subsequent characters up to, including
640  * the next "/" character (if any) or the end of the input buffer.
641  */
642  while(in < end && !is_slash(*in))
643  *out++ = *in++;
644  if(in == end)
645  break;
646  *out++ = *in++;
647 
648  while(in < end) {
649  if(*in != '.')
650  break;
651 
652  /* Handle ending "/." */
653  if(in + 1 == end) {
654  ++in;
655  break;
656  }
657 
658  /* Handle "/./" */
659  if(is_slash(in[1])) {
660  in += 2;
661  continue;
662  }
663 
664  /* If we don't have "/../" or ending "/.." */
665  if(in[1] != '.' || (in + 2 != end && !is_slash(in[2])))
666  break;
667 
668  /* Find the slash preceding out pointer and move out pointer to it */
669  if(out > path+1 && is_slash(*--out))
670  --out;
671  while(out > path && !is_slash(*(--out)));
672  if(is_slash(*out))
673  ++out;
674  in += 2;
675  if(in != end)
676  ++in;
677  }
678  }
679 
680  len = out - path;
681  TRACE("(%p %d): Path after dot segments removed %s len=%d\n", path, path_len,
682  debugstr_wn(path, len), len);
683  return len;
684 }
685 
686 /* Attempts to find the file extension in a given path. */
688  const WCHAR *end;
689 
690  for(end = path+path_len-1; end >= path && *end != '/' && *end != '\\'; --end) {
691  if(*end == '.')
692  return end-path;
693  }
694 
695  return -1;
696 }
697 
698 /* Computes the location where the elision should occur in the IPv6
699  * address using the numerical values of each component stored in
700  * 'values'. If the address shouldn't contain an elision then 'index'
701  * is assigned -1 as its value. Otherwise 'index' will contain the
702  * starting index (into values) where the elision should be, and 'count'
703  * will contain the number of cells the elision covers.
704  *
705  * NOTES:
706  * Windows will expand an elision if the elision only represents one h16
707  * component of the address.
708  *
709  * Ex: [1::2:3:4:5:6:7] -> [1:0:2:3:4:5:6:7]
710  *
711  * If the IPv6 address contains an IPv4 address, the IPv4 address is also
712  * considered for being included as part of an elision if all its components
713  * are zeros.
714  *
715  * Ex: [1:2:3:4:5:6:0.0.0.0] -> [1:2:3:4:5:6::]
716  */
718  INT *index, DWORD *count) {
719  DWORD i, max_len, cur_len;
720  INT max_index, cur_index;
721 
722  max_len = cur_len = 0;
723  max_index = cur_index = -1;
724  for(i = 0; i < 8; ++i) {
725  BOOL check_ipv4 = (address->ipv4 && i == 6);
726  BOOL is_end = (check_ipv4 || i == 7);
727 
728  if(check_ipv4) {
729  /* Check if the IPv4 address contains only zeros. */
730  if(values[i] == 0 && values[i+1] == 0) {
731  if(cur_index == -1)
732  cur_index = i;
733 
734  cur_len += 2;
735  ++i;
736  }
737  } else if(values[i] == 0) {
738  if(cur_index == -1)
739  cur_index = i;
740 
741  ++cur_len;
742  }
743 
744  if(is_end || values[i] != 0) {
745  /* We only consider it for an elision if it's
746  * more than 1 component long.
747  */
748  if(cur_len > 1 && cur_len > max_len) {
749  /* Found the new elision location. */
750  max_len = cur_len;
751  max_index = cur_index;
752  }
753 
754  /* Reset the current range for the next range of zeros. */
755  cur_index = -1;
756  cur_len = 0;
757  }
758  }
759 
760  *index = max_index;
761  *count = max_len;
762 }
763 
764 /* Removes all the leading and trailing white spaces or
765  * control characters from the URI and removes all control
766  * characters inside of the URI string.
767  */
769  const WCHAR *start, *end, *ptr;
770  WCHAR *ptr2;
771  DWORD len;
772  BSTR ret;
773 
774  start = uri;
775  /* Skip leading controls and whitespace. */
776  while(*start && (iscntrlW(*start) || isspaceW(*start))) ++start;
777 
778  /* URI consisted only of control/whitespace. */
779  if(!*start)
780  return SysAllocStringLen(NULL, 0);
781 
782  end = start + strlenW(start);
783  while(--end > start && (iscntrlW(*end) || isspaceW(*end)));
784 
785  len = ++end - start;
786  for(ptr = start; ptr < end; ptr++) {
787  if(iscntrlW(*ptr))
788  len--;
789  }
790 
792  if(!ret)
793  return NULL;
794 
795  for(ptr = start, ptr2=ret; ptr < end; ptr++) {
796  if(!iscntrlW(*ptr))
797  *ptr2++ = *ptr;
798  }
799 
800  return ret;
801 }
802 
803 /* Converts the specified IPv4 address into an uint value.
804  *
805  * This function assumes that the IPv4 address has already been validated.
806  */
807 static UINT ipv4toui(const WCHAR *ip, DWORD len) {
808  UINT ret = 0;
809  DWORD comp_value = 0;
810  const WCHAR *ptr;
811 
812  for(ptr = ip; ptr < ip+len; ++ptr) {
813  if(*ptr == '.') {
814  ret <<= 8;
815  ret += comp_value;
816  comp_value = 0;
817  } else
818  comp_value = comp_value*10 + (*ptr-'0');
819  }
820 
821  ret <<= 8;
822  ret += comp_value;
823 
824  return ret;
825 }
826 
827 /* Converts an IPv4 address in numerical form into its fully qualified
828  * string form. This function returns the number of characters written
829  * to 'dest'. If 'dest' is NULL this function will return the number of
830  * characters that would have been written.
831  *
832  * It's up to the caller to ensure there's enough space in 'dest' for the
833  * address.
834  */
836  static const WCHAR formatW[] =
837  {'%','u','.','%','u','.','%','u','.','%','u',0};
838  DWORD ret = 0;
839  UCHAR digits[4];
840 
841  digits[0] = (address >> 24) & 0xff;
842  digits[1] = (address >> 16) & 0xff;
843  digits[2] = (address >> 8) & 0xff;
844  digits[3] = address & 0xff;
845 
846  if(!dest) {
847  WCHAR tmp[16];
848  ret = sprintfW(tmp, formatW, digits[0], digits[1], digits[2], digits[3]);
849  } else
850  ret = sprintfW(dest, formatW, digits[0], digits[1], digits[2], digits[3]);
851 
852  return ret;
853 }
854 
856  static const WCHAR formatW[] = {'%','u',0};
857  DWORD ret = 0;
858 
859  if(!dest) {
860  WCHAR tmp[11];
861  ret = sprintfW(tmp, formatW, value);
862  } else
863  ret = sprintfW(dest, formatW, value);
864 
865  return ret;
866 }
867 
868 /* Converts a h16 component (from an IPv6 address) into its
869  * numerical value.
870  *
871  * This function assumes that the h16 component has already been validated.
872  */
873 static USHORT h16tous(h16 component) {
874  DWORD i;
875  USHORT ret = 0;
876 
877  for(i = 0; i < component.len; ++i) {
878  ret <<= 4;
879  ret += hex_to_int(component.str[i]);
880  }
881 
882  return ret;
883 }
884 
885 /* Converts an IPv6 address into its 128 bits (16 bytes) numerical value.
886  *
887  * This function assumes that the ipv6_address has already been validated.
888  */
890  DWORD i, cur_component = 0;
891  BOOL already_passed_elision = FALSE;
892 
893  for(i = 0; i < address->h16_count; ++i) {
894  if(address->elision) {
895  if(address->components[i].str > address->elision && !already_passed_elision) {
896  /* Means we just passed the elision and need to add its values to
897  * 'number' before we do anything else.
898  */
899  INT j;
900  for(j = 0; j < address->elision_size; j+=2)
901  number[cur_component++] = 0;
902 
903  already_passed_elision = TRUE;
904  }
905  }
906 
907  number[cur_component++] = h16tous(address->components[i]);
908  }
909 
910  /* Case when the elision appears after the h16 components. */
911  if(!already_passed_elision && address->elision) {
912  INT j;
913  for(j = 0; j < address->elision_size; j+=2)
914  number[cur_component++] = 0;
915  }
916 
917  if(address->ipv4) {
918  UINT value = ipv4toui(address->ipv4, address->ipv4_len);
919 
920  if(cur_component != 6) {
921  ERR("(%p %p): Failed sanity check with %d\n", address, number, cur_component);
922  return FALSE;
923  }
924 
925  number[cur_component++] = (value >> 16) & 0xffff;
926  number[cur_component] = value & 0xffff;
927  }
928 
929  return TRUE;
930 }
931 
932 /* Checks if the characters pointed to by 'ptr' are
933  * a percent encoded data octet.
934  *
935  * pct-encoded = "%" HEXDIG HEXDIG
936  */
937 static BOOL check_pct_encoded(const WCHAR **ptr) {
938  const WCHAR *start = *ptr;
939 
940  if(**ptr != '%')
941  return FALSE;
942 
943  ++(*ptr);
944  if(!is_hexdigit(**ptr)) {
945  *ptr = start;
946  return FALSE;
947  }
948 
949  ++(*ptr);
950  if(!is_hexdigit(**ptr)) {
951  *ptr = start;
952  return FALSE;
953  }
954 
955  ++(*ptr);
956  return TRUE;
957 }
958 
959 /* dec-octet = DIGIT ; 0-9
960  * / %x31-39 DIGIT ; 10-99
961  * / "1" 2DIGIT ; 100-199
962  * / "2" %x30-34 DIGIT ; 200-249
963  * / "25" %x30-35 ; 250-255
964  */
965 static BOOL check_dec_octet(const WCHAR **ptr) {
966  const WCHAR *c1, *c2, *c3;
967 
968  c1 = *ptr;
969  /* A dec-octet must be at least 1 digit long. */
970  if(*c1 < '0' || *c1 > '9')
971  return FALSE;
972 
973  ++(*ptr);
974 
975  c2 = *ptr;
976  /* Since the 1-digit requirement was met, it doesn't
977  * matter if this is a DIGIT value, it's considered a
978  * dec-octet.
979  */
980  if(*c2 < '0' || *c2 > '9')
981  return TRUE;
982 
983  ++(*ptr);
984 
985  c3 = *ptr;
986  /* Same explanation as above. */
987  if(*c3 < '0' || *c3 > '9')
988  return TRUE;
989 
990  /* Anything > 255 isn't a valid IP dec-octet. */
991  if(*c1 >= '2' && *c2 >= '5' && *c3 >= '5') {
992  *ptr = c1;
993  return FALSE;
994  }
995 
996  ++(*ptr);
997  return TRUE;
998 }
999 
1000 /* Checks if there is an implicit IPv4 address in the host component of the URI.
1001  * The max value of an implicit IPv4 address is UINT_MAX.
1002  *
1003  * Ex:
1004  * "234567" would be considered an implicit IPv4 address.
1005  */
1007  const WCHAR *start = *ptr;
1008  ULONGLONG ret = 0;
1009  *val = 0;
1010 
1011  while(is_num(**ptr)) {
1012  ret = ret*10 + (**ptr - '0');
1013 
1014  if(ret > UINT_MAX) {
1015  *ptr = start;
1016  return FALSE;
1017  }
1018  ++(*ptr);
1019  }
1020 
1021  if(*ptr == start)
1022  return FALSE;
1023 
1024  *val = ret;
1025  return TRUE;
1026 }
1027 
1028 /* Checks if the string contains an IPv4 address.
1029  *
1030  * This function has a strict mode or a non-strict mode of operation
1031  * When 'strict' is set to FALSE this function will return TRUE if
1032  * the string contains at least 'dec-octet "." dec-octet' since partial
1033  * IPv4 addresses will be normalized out into full IPv4 addresses. When
1034  * 'strict' is set this function expects there to be a full IPv4 address.
1035  *
1036  * IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
1037  */
1039  const WCHAR *start = *ptr;
1040 
1041  if(!check_dec_octet(ptr)) {
1042  *ptr = start;
1043  return FALSE;
1044  }
1045 
1046  if(**ptr != '.') {
1047  *ptr = start;
1048  return FALSE;
1049  }
1050 
1051  ++(*ptr);
1052  if(!check_dec_octet(ptr)) {
1053  *ptr = start;
1054  return FALSE;
1055  }
1056 
1057  if(**ptr != '.') {
1058  if(strict) {
1059  *ptr = start;
1060  return FALSE;
1061  } else
1062  return TRUE;
1063  }
1064 
1065  ++(*ptr);
1066  if(!check_dec_octet(ptr)) {
1067  *ptr = start;
1068  return FALSE;
1069  }
1070 
1071  if(**ptr != '.') {
1072  if(strict) {
1073  *ptr = start;
1074  return FALSE;
1075  } else
1076  return TRUE;
1077  }
1078 
1079  ++(*ptr);
1080  if(!check_dec_octet(ptr)) {
1081  *ptr = start;
1082  return FALSE;
1083  }
1084 
1085  /* Found a four digit ip address. */
1086  return TRUE;
1087 }
1088 /* Tries to parse the scheme name of the URI.
1089  *
1090  * scheme = ALPHA *(ALPHA | NUM | '+' | '-' | '.') as defined by RFC 3896.
1091  * NOTE: Windows accepts a number as the first character of a scheme.
1092  */
1093 static BOOL parse_scheme_name(const WCHAR **ptr, parse_data *data, DWORD extras) {
1094  const WCHAR *start = *ptr;
1095 
1096  data->scheme = NULL;
1097  data->scheme_len = 0;
1098 
1099  while(**ptr) {
1100  if(**ptr == '*' && *ptr == start) {
1101  /* Might have found a wildcard scheme. If it is the next
1102  * char has to be a ':' for it to be a valid URI
1103  */
1104  ++(*ptr);
1105  break;
1106  } else if(!is_num(**ptr) && !is_alpha(**ptr) && **ptr != '+' &&
1107  **ptr != '-' && **ptr != '.')
1108  break;
1109 
1110  (*ptr)++;
1111  }
1112 
1113  if(*ptr == start)
1114  return FALSE;
1115 
1116  /* Schemes must end with a ':' */
1117  if(**ptr != ':' && !((extras & ALLOW_NULL_TERM_SCHEME) && !**ptr)) {
1118  *ptr = start;
1119  return FALSE;
1120  }
1121 
1122  data->scheme = start;
1123  data->scheme_len = *ptr - start;
1124 
1125  ++(*ptr);
1126  return TRUE;
1127 }
1128 
1129 /* Tries to deduce the corresponding URL_SCHEME for the given URI. Stores
1130  * the deduced URL_SCHEME in data->scheme_type.
1131  */
1133  /* If there's scheme data then see if it's a recognized scheme. */
1134  if(data->scheme && data->scheme_len) {
1135  DWORD i;
1136 
1137  for(i = 0; i < ARRAY_SIZE(recognized_schemes); ++i) {
1138  if(lstrlenW(recognized_schemes[i].scheme_name) == data->scheme_len) {
1139  /* Has to be a case insensitive compare. */
1140  if(!StrCmpNIW(recognized_schemes[i].scheme_name, data->scheme, data->scheme_len)) {
1141  data->scheme_type = recognized_schemes[i].scheme;
1142  return TRUE;
1143  }
1144  }
1145  }
1146 
1147  /* If we get here it means it's not a recognized scheme. */
1148  data->scheme_type = URL_SCHEME_UNKNOWN;
1149  return TRUE;
1150  } else if(data->is_relative) {
1151  /* Relative URI's have no scheme. */
1152  data->scheme_type = URL_SCHEME_UNKNOWN;
1153  return TRUE;
1154  } else {
1155  /* Should never reach here! what happened... */
1156  FIXME("(%p): Unable to determine scheme type for URI %s\n", data, debugstr_w(data->uri));
1157  return FALSE;
1158  }
1159 }
1160 
1161 /* Tries to parse (or deduce) the scheme_name of a URI. If it can't
1162  * parse a scheme from the URI it will try to deduce the scheme_name and scheme_type
1163  * using the flags specified in 'flags' (if any). Flags that affect how this function
1164  * operates are the Uri_CREATE_ALLOW_* flags.
1165  *
1166  * All parsed/deduced information will be stored in 'data' when the function returns.
1167  *
1168  * Returns TRUE if it was able to successfully parse the information.
1169  */
1170 static BOOL parse_scheme(const WCHAR **ptr, parse_data *data, DWORD flags, DWORD extras) {
1171  static const WCHAR fileW[] = {'f','i','l','e',0};
1172  static const WCHAR wildcardW[] = {'*',0};
1173 
1174  /* First check to see if the uri could implicitly be a file path. */
1175  if(is_implicit_file_path(*ptr)) {
1176  if(flags & Uri_CREATE_ALLOW_IMPLICIT_FILE_SCHEME) {
1177  data->scheme = fileW;
1178  data->scheme_len = lstrlenW(fileW);
1179  data->has_implicit_scheme = TRUE;
1180 
1181  TRACE("(%p %p %x): URI is an implicit file path.\n", ptr, data, flags);
1182  } else {
1183  /* Windows does not consider anything that can implicitly be a file
1184  * path to be a valid URI if the ALLOW_IMPLICIT_FILE_SCHEME flag is not set...
1185  */
1186  TRACE("(%p %p %x): URI is implicitly a file path, but, the ALLOW_IMPLICIT_FILE_SCHEME flag wasn't set.\n",
1187  ptr, data, flags);
1188  return FALSE;
1189  }
1190  } else if(!parse_scheme_name(ptr, data, extras)) {
1191  /* No scheme was found, this means it could be:
1192  * a) an implicit Wildcard scheme
1193  * b) a relative URI
1194  * c) an invalid URI.
1195  */
1196  if(flags & Uri_CREATE_ALLOW_IMPLICIT_WILDCARD_SCHEME) {
1197  data->scheme = wildcardW;
1198  data->scheme_len = lstrlenW(wildcardW);
1199  data->has_implicit_scheme = TRUE;
1200 
1201  TRACE("(%p %p %x): URI is an implicit wildcard scheme.\n", ptr, data, flags);
1202  } else if (flags & Uri_CREATE_ALLOW_RELATIVE) {
1203  data->is_relative = TRUE;
1204  TRACE("(%p %p %x): URI is relative.\n", ptr, data, flags);
1205  } else {
1206  TRACE("(%p %p %x): Malformed URI found. Unable to deduce scheme name.\n", ptr, data, flags);
1207  return FALSE;
1208  }
1209  }
1210 
1211  if(!data->is_relative)
1212  TRACE("(%p %p %x): Found scheme=%s scheme_len=%d\n", ptr, data, flags,
1213  debugstr_wn(data->scheme, data->scheme_len), data->scheme_len);
1214 
1215  if(!parse_scheme_type(data))
1216  return FALSE;
1217 
1218  TRACE("(%p %p %x): Assigned %d as the URL_SCHEME.\n", ptr, data, flags, data->scheme_type);
1219  return TRUE;
1220 }
1221 
/* NOTE(review): the line carrying this function's signature (listing line
 * 1222) is absent from this listing. parse_userinfo() calls it as
 * parse_username(ptr, data, flags, 0); confirm the exact prototype against
 * the original file.
 *
 * Scans a username starting at *ptr. The scan stops at the first ':' or '@',
 * or at a NUL when ALLOW_NULL_TERM_USER_NAME is set in 'extras'.
 *
 * On success data->username / data->username_len describe the scanned text,
 * *ptr is left on the terminating character and TRUE is returned.
 * On failure *ptr is restored, data->username is set to NULL and FALSE is
 * returned.
 */
1223  data->username = *ptr;
1224
1225  while(**ptr != ':' && **ptr != '@') {
1226  if(**ptr == '%') {
/* An invalid escape is only fatal for known scheme types; for unknown
 * schemes the '%' is stepped over like any other character. On a valid
 * escape the loop continues without incrementing, so check_pct_encoded()
 * has presumably advanced *ptr itself -- TODO confirm.
 */
1227  if(!check_pct_encoded(ptr)) {
1228  if(data->scheme_type != URL_SCHEME_UNKNOWN) {
1229  *ptr = data->username;
1230  data->username = NULL;
1231  return FALSE;
1232  }
1233  } else
1234  continue;
1235  } else if(extras & ALLOW_NULL_TERM_USER_NAME && !**ptr)
1236  break;
1237  else if(is_auth_delim(**ptr, data->scheme_type != URL_SCHEME_UNKNOWN)) {
/* Hit an authority delimiter before ':' or '@': no username here. */
1238  *ptr = data->username;
1239  data->username = NULL;
1240  return FALSE;
1241  }
1242
1243  ++(*ptr);
1244  }
1245
1246  data->username_len = *ptr - data->username;
1247  return TRUE;
1248 }
1249 
/* NOTE(review): the line carrying this function's signature (listing line
 * 1250) is absent from this listing. parse_userinfo() calls it as
 * parse_password(ptr, data, flags, 0); confirm the exact prototype against
 * the original file.
 *
 * Scans a password starting at *ptr. The scan stops at the first '@', or at
 * a NUL when ALLOW_NULL_TERM_PASSWORD is set in 'extras'. A ':' does not end
 * the scan, so it becomes part of the password.
 *
 * On success data->password / data->password_len describe the scanned text,
 * *ptr is left on the terminating character and TRUE is returned.
 * On failure *ptr is restored, data->password is set to NULL and FALSE is
 * returned.
 */
1251  data->password = *ptr;
1252
1253  while(**ptr != '@') {
1254  if(**ptr == '%') {
/* An invalid escape is only fatal for known scheme types. On a valid
 * escape the loop continues without incrementing, so check_pct_encoded()
 * has presumably advanced *ptr itself -- TODO confirm.
 */
1255  if(!check_pct_encoded(ptr)) {
1256  if(data->scheme_type != URL_SCHEME_UNKNOWN) {
1257  *ptr = data->password;
1258  data->password = NULL;
1259  return FALSE;
1260  }
1261  } else
1262  continue;
1263  } else if(extras & ALLOW_NULL_TERM_PASSWORD && !**ptr)
1264  break;
1265  else if(is_auth_delim(**ptr, data->scheme_type != URL_SCHEME_UNKNOWN)) {
/* Hit an authority delimiter before '@': no password here. */
1266  *ptr = data->password;
1267  data->password = NULL;
1268  return FALSE;
1269  }
1270
1271  ++(*ptr);
1272  }
1273
1274  data->password_len = *ptr - data->password;
1275  return TRUE;
1276 }
1277 
1278 /* Parses the userinfo part of the URI (if it exists). The userinfo field of
1279  * a URI can consist of "username:password@", or just "username@".
1280  *
1281  * RFC def:
1282  * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
1283  *
1284  * NOTES:
1285  * 1) If there is more than one ':' in the userinfo part of the URI Windows
1286  * uses the first occurrence of ':' to delimit the username and password
1287  * components.
1288  *
1289  * ex:
1290  * ftp://user:pass:word@winehq.org
1291  *
1292  * would yield "user" as the username and "pass:word" as the password.
1293  *
1294  * 2) Windows allows any character to appear in the "userinfo" part of
1295  * a URI, as long as it's not an authority delimiter character set.
1296  */
1297 static void parse_userinfo(const WCHAR **ptr, parse_data *data, DWORD flags) {
1298  const WCHAR *start = *ptr;
1299 
1300  if(!parse_username(ptr, data, flags, 0)) {
1301  TRACE("(%p %p %x): URI contained no userinfo.\n", ptr, data, flags);
1302  return;
1303  }
1304 
1305  if(**ptr == ':') {
1306  ++(*ptr);
1307  if(!parse_password(ptr, data, flags, 0)) {
1308  *ptr = start;
1309  data->username = NULL;
1310  data->username_len = 0;
1311  TRACE("(%p %p %x): URI contained no userinfo.\n", ptr, data, flags);
1312  return;
1313  }
1314  }
1315 
1316  if(**ptr != '@') {
1317  *ptr = start;
1318  data->username = NULL;
1319  data->username_len = 0;
1320  data->password = NULL;
1321  data->password_len = 0;
1322 
1323  TRACE("(%p %p %x): URI contained no userinfo.\n", ptr, data, flags);
1324  return;
1325  }
1326 
1327  if(data->username)
1328  TRACE("(%p %p %x): Found username %s len=%d.\n", ptr, data, flags,
1329  debugstr_wn(data->username, data->username_len), data->username_len);
1330 
1331  if(data->password)
1332  TRACE("(%p %p %x): Found password %s len=%d.\n", ptr, data, flags,
1333  debugstr_wn(data->password, data->password_len), data->password_len);
1334 
1335  ++(*ptr);
1336 }
1337 
1338 /* Attempts to parse a port from the URI.
1339  *
1340  * NOTES:
1341  * Windows seems to cap the maximum value a port can have.
1342  * The maximum is USHRT_MAX; larger values are rejected.
1343  *
1344  * port = *DIGIT
1345  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1346) is absent from this listing. The host parsers in this file call it
 * as parse_port(ptr, data, flags); confirm the exact prototype against the
 * original file.
 *
 * Reads decimal digits from *ptr until an authority delimiter is reached.
 *
 * On success data->port / data->port_len describe the digits,
 * data->port_value holds their numeric value, data->has_port is TRUE and
 * *ptr is left on the delimiter. Because the loop body does not run when a
 * delimiter is met immediately, an empty port is accepted with value 0 and
 * length 0.
 *
 * On a non-digit, or a value above USHRT_MAX, *ptr is restored, data->port
 * is set to NULL and FALSE is returned.
 */
1347  UINT port = 0;
1348  data->port = *ptr;
1349
1350  while(!is_auth_delim(**ptr, data->scheme_type != URL_SCHEME_UNKNOWN)) {
1351  if(!is_num(**ptr)) {
1352  *ptr = data->port;
1353  data->port = NULL;
1354  return FALSE;
1355  }
1356
1357  port = port*10 + (**ptr-'0');
1358
/* Checked after every digit, so the accumulator is at most
 * USHRT_MAX*10 + 9 when it is tested.
 */
1359  if(port > USHRT_MAX) {
1360  *ptr = data->port;
1361  data->port = NULL;
1362  return FALSE;
1363  }
1364
1365  ++(*ptr);
1366  }
1367
1368  data->has_port = TRUE;
1369  data->port_value = port;
1370  data->port_len = *ptr - data->port;
1371
1372  TRACE("(%p %p %x): Found port %s len=%d value=%u\n", ptr, data, flags,
1373  debugstr_wn(data->port, data->port_len), data->port_len, data->port_value);
1374  return TRUE;
1375 }
1376 
1377 /* Attempts to parse a IPv4 address from the URI.
1378  *
1379  * NOTES:
1380  * Windows normalizes IPv4 addresses, This means there are three
1381  * possibilities for the URI to contain an IPv4 address.
1382  * 1) A well formed address (ex. 192.2.2.2).
1383  * 2) A partially formed address. For example "192.0" would
1384  * normalize to "192.0.0.0" during canonicalization.
1385  * 3) An implicit IPv4 address. For example "256" would
1386  * normalize to "0.0.1.0" during canonicalization. Also
1387  * note that the maximum value for an implicit IP address
1388  * is UINT_MAX, if the value in the URI exceeds this then
1389  * it is not considered an IPv4 address.
1390  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1391) is absent from this listing. parse_host() calls it as
 * parse_ipv4address(ptr, data, flags); confirm the exact prototype against
 * the original file.
 *
 * Tries check_ipv4address() first and falls back to check_implicit_ipv4(),
 * which stores its result in data->implicit_ipv4. The match only counts as
 * an IPv4 host when it is followed by ":port" or by an authority delimiter.
 *
 * On success data->host, data->host_len and data->host_type (Uri_HOST_IPV4)
 * are set and TRUE is returned. On failure *ptr is restored, data->host is
 * set to NULL and FALSE is returned.
 */
1392  const BOOL is_unknown = data->scheme_type == URL_SCHEME_UNKNOWN;
1393  data->host = *ptr;
1394
1395  if(!check_ipv4address(ptr, FALSE)) {
1396  if(!check_implicit_ipv4(ptr, &data->implicit_ipv4)) {
1397  TRACE("(%p %p %x): URI didn't contain anything looking like an IPv4 address.\n",
1398  ptr, data, flags);
1399  *ptr = data->host;
1400  data->host = NULL;
1401  return FALSE;
1402  } else
1403  data->has_implicit_ip = TRUE;
1404  }
1405
1406  data->host_len = *ptr - data->host;
1407  data->host_type = Uri_HOST_IPV4;
1408
1409  /* Check if what we found is the only part of the host name (if it isn't
1410  * we don't have an IPv4 address).
1411  */
1412  if(**ptr == ':') {
1413  ++(*ptr);
/* NOTE(review): unlike the branch further down, this failure path does
 * not clear data->has_implicit_ip; host_len and host_type assigned above
 * are also left as they are. Confirm that later stages do not depend on
 * these fields after a failed IPv4 parse.
 */
1414  if(!parse_port(ptr, data, flags)) {
1415  *ptr = data->host;
1416  data->host = NULL;
1417  return FALSE;
1418  }
1419  } else if(!is_auth_delim(**ptr, !is_unknown)) {
1420  /* Found more data which belongs to the host, so this isn't an IPv4. */
1421  *ptr = data->host;
1422  data->host = NULL;
1423  data->has_implicit_ip = FALSE;
1424  return FALSE;
1425  }
1426
1427  TRACE("(%p %p %x): IPv4 address found. host=%s host_len=%d host_type=%d\n",
1428  ptr, data, flags, debugstr_wn(data->host, data->host_len),
1429  data->host_len, data->host_type);
1430  return TRUE;
1431 }
1432 
1433 /* Attempts to parse the reg-name from the URI.
1434  *
1435  * Because of the way Windows handles ':' this function also
1436  * handles parsing the port.
1437  *
1438  * reg-name = *( unreserved / pct-encoded / sub-delims )
1439  *
1440  * NOTE:
1441  * Windows allows any character to appear in a reg-name except the
1442  * characters in "auth_delims" and ':'. For unknown scheme types ':' is
1443  * also allowed to appear (even if a valid port doesn't follow it).
1444  *
1445  * Windows doesn't like host names which start with '[' and end with ']'
1446  * and don't contain a valid IP literal address in between them.
1447  *
1448  * On Windows if a '[' is encountered in the host name the ':' no longer
1449  * counts as a delimiter until you reach the next ']' or an "authority delimiter".
1450  *
1451  * A reg-name CAN be empty.
1452  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1453) is absent from this listing. parse_host() calls it as
 * parse_reg_name(ptr, data, flags, extras); confirm the exact prototype
 * against the original file.
 *
 * Scans a reg-name host starting at *ptr and records it in data->host and
 * data->host_len. A ':' found outside square brackets is handed to
 * parse_port(), unless IGNORE_PORT_DELIMITER is set in 'extras' or the
 * scheme is URL_SCHEME_RES.
 *
 * data->host_type becomes Uri_HOST_UNKNOWN for an empty host, a drive path
 * or a res URI, and Uri_HOST_DNS otherwise.
 *
 * Returns TRUE on success. On failure *ptr is reset to data->host,
 * data->host is set to NULL and FALSE is returned.
 */
1454  const BOOL has_start_bracket = **ptr == '[';
1455  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
1456  const BOOL is_res = data->scheme_type == URL_SCHEME_RES;
1457  BOOL inside_brackets = has_start_bracket;
1458
1459  /* res URIs don't have ports. */
1460  BOOL ignore_col = (extras & IGNORE_PORT_DELIMITER) || is_res;
1461
1462  /* We have to be careful with file schemes. */
1463  if(data->scheme_type == URL_SCHEME_FILE) {
1464  /* This is because an implicit file scheme could be "C:\\test" and it
1465  * would trick this function into thinking the host is "C", when after
1466  * canonicalization the host would end up being an empty string. A drive
1467  * path can also have a '|' instead of a ':' after the drive letter.
1468  */
1469  if(is_drive_path(*ptr)) {
1470  /* Regular old drive paths have no host type (or host name). */
1471  data->host_type = Uri_HOST_UNKNOWN;
1472  data->host = *ptr;
1473  data->host_len = 0;
1474  return TRUE;
1475  } else if(is_unc_path(*ptr))
1476  /* Skip past the "\\" of a UNC path. */
1477  *ptr += 2;
1478  }
1479
1480  data->host = *ptr;
1481
1482  /* For res URIs, everything before the first '/' is
1483  * considered the host.
1484  */
1485  while((!is_res && !is_auth_delim(**ptr, known_scheme)) ||
1486  (is_res && **ptr && **ptr != '/')) {
1487  if(**ptr == ':' && !ignore_col) {
1488  /* We can ignore ':' if we are inside brackets.*/
1489  if(!inside_brackets) {
/* Remember where the ':' is so the host length can exclude the port. */
1490  const WCHAR *tmp = (*ptr)++;
1491
1492  /* Attempt to parse the port. */
1493  if(!parse_port(ptr, data, flags)) {
1494  /* Windows expects there to be a valid port for known scheme types. */
1495  if(data->scheme_type != URL_SCHEME_UNKNOWN) {
1496  *ptr = data->host;
1497  data->host = NULL;
1498  TRACE("(%p %p %x %x): Expected valid port\n", ptr, data, flags, extras);
1499  return FALSE;
1500  } else
1501  /* Windows gives up on trying to parse a port when it
1502  * encounters an invalid port.
1503  */
1504  ignore_col = TRUE;
1505  } else {
1506  data->host_len = tmp - data->host;
1507  break;
1508  }
1509  }
1510  } else if(**ptr == '%' && (known_scheme && !is_res)) {
1511  /* Has to be a legit % encoded value. */
1512  if(!check_pct_encoded(ptr)) {
1513  *ptr = data->host;
1514  data->host = NULL;
1515  return FALSE;
1516  } else
1517  continue;
1518  } else if(is_res && is_forbidden_dos_path_char(**ptr)) {
1519  *ptr = data->host;
1520  data->host = NULL;
1521  return FALSE;
1522  } else if(**ptr == ']')
1523  inside_brackets = FALSE;
1524  else if(**ptr == '[')
1525  inside_brackets = TRUE;
1526
1527  ++(*ptr);
1528  }
1529
1530  if(has_start_bracket) {
1531  /* Make sure the last character of the host wasn't a ']'. */
1532  if(*(*ptr-1) == ']') {
1533  TRACE("(%p %p %x %x): Expected an IP literal inside of the host\n",
1534  ptr, data, flags, extras);
1535  *ptr = data->host;
1536  data->host = NULL;
1537  return FALSE;
1538  }
1539  }
1540
1541  /* Don't overwrite our length if we found a port earlier. */
1542  if(!data->port)
1543  data->host_len = *ptr - data->host;
1544
1545  /* If the host is empty, then it's an unknown host type. */
1546  if(data->host_len == 0 || is_res)
1547  data->host_type = Uri_HOST_UNKNOWN;
1548  else
1549  data->host_type = Uri_HOST_DNS;
1550
1551  TRACE("(%p %p %x %x): Parsed reg-name. host=%s len=%d\n", ptr, data, flags, extras,
1552  debugstr_wn(data->host, data->host_len), data->host_len);
1553  return TRUE;
1554 }
1555 
1556 /* Attempts to parse an IPv6 address out of the URI.
1557  *
1558  * IPv6address = 6( h16 ":" ) ls32
1559  * / "::" 5( h16 ":" ) ls32
1560  * / [ h16 ] "::" 4( h16 ":" ) ls32
1561  * / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
1562  * / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
1563  * / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
1564  * / [ *4( h16 ":" ) h16 ] "::" ls32
1565  * / [ *5( h16 ":" ) h16 ] "::" h16
1566  * / [ *6( h16 ":" ) h16 ] "::"
1567  *
1568  * ls32 = ( h16 ":" h16 ) / IPv4address
1569  * ; least-significant 32 bits of address.
1570  *
1571  * h16 = 1*4HEXDIG
1572  * ; 16 bits of address represented in hexadecimal.
1573  *
1574  * Modeled after google-url's 'DoParseIPv6' function.
1575  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1576) is absent from this listing. The IP-literal parser in this file
 * calls it as parse_ipv6address(ptr, data, flags); confirm the exact
 * prototype against the original file. Listing lines 1682 and 1715 are
 * missing as well; see the notes at those positions.
 *
 * Walks the text at *ptr, collecting up to eight h16 components, at most one
 * "::" elision and an optional trailing IPv4 address into a local
 * ipv6_address. The scan ends at an authority delimiter, at ']' or after an
 * attached IPv4 address.
 *
 * On success data->host_type is Uri_HOST_IPV6, data->has_ipv6 is TRUE,
 * data->ipv6_address holds the collected pieces and TRUE is returned.
 * On failure *ptr is reset to where the scan started and FALSE is returned.
 */
1577  const WCHAR *start, *cur_start;
1578  ipv6_address ip;
1579
1580  start = cur_start = *ptr;
1581  memset(&ip, 0, sizeof(ipv6_address));
1582
1583  for(;; ++(*ptr)) {
1584  /* Check if we're on the last character of the host. */
1585  BOOL is_end = (is_auth_delim(**ptr, data->scheme_type != URL_SCHEME_UNKNOWN)
1586  || **ptr == ']');
1587
1588  BOOL is_split = (**ptr == ':');
1589  BOOL is_elision = (is_split && !is_end && *(*ptr+1) == ':');
1590
1591  /* Check if we're at the end of a component, or
1592  * if we're at the end of the IPv6 address.
1593  */
1594  if(is_split || is_end) {
1595  DWORD cur_len = 0;
1596
1597  cur_len = *ptr - cur_start;
1598
1599  /* h16 can't have a length > 4. */
1600  if(cur_len > 4) {
1601  *ptr = start;
1602
1603  TRACE("(%p %p %x): h16 component to long.\n",
1604  ptr, data, flags);
1605  return FALSE;
1606  }
1607
1608  if(cur_len == 0) {
1609  /* An h16 component can't have the length of 0 unless
1610  * the elision is at the beginning of the address, or
1611  * at the end of the address.
1612  */
1613  if(!((*ptr == start && is_elision) ||
1614  (is_end && (*ptr-2) == ip.elision))) {
1615  *ptr = start;
1616  TRACE("(%p %p %x): IPv6 component cannot have a length of 0.\n",
1617  ptr, data, flags);
1618  return FALSE;
1619  }
1620  }
1621
1622  if(cur_len > 0) {
1623  /* An IPv6 address can have no more than 8 h16 components. */
1624  if(ip.h16_count >= 8) {
1625  *ptr = start;
1626  TRACE("(%p %p %x): Not a IPv6 address, too many h16 components.\n",
1627  ptr, data, flags);
1628  return FALSE;
1629  }
1630
1631  ip.components[ip.h16_count].str = cur_start;
1632  ip.components[ip.h16_count].len = cur_len;
1633
1634  TRACE("(%p %p %x): Found h16 component %s, len=%d, h16_count=%d\n",
1635  ptr, data, flags, debugstr_wn(cur_start, cur_len), cur_len,
1636  ip.h16_count);
1637  ++ip.h16_count;
1638  }
1639  }
1640
1641  if(is_end)
1642  break;
1643
1644  if(is_elision) {
1645  /* An IPv6 address can only have 1 elision ('::'). */
1646  if(ip.elision) {
1647  *ptr = start;
1648
1649  TRACE("(%p %p %x): IPv6 address cannot have 2 elisions.\n",
1650  ptr, data, flags);
1651  return FALSE;
1652  }
1653
1654  ip.elision = *ptr;
/* Step onto the second ':' so the next component starts after "::". */
1655  ++(*ptr);
1656  }
1657
1658  if(is_split)
1659  cur_start = *ptr+1;
1660  else {
1661  if(!check_ipv4address(ptr, TRUE)) {
1662  if(!is_hexdigit(**ptr)) {
1663  /* Not a valid character for an IPv6 address. */
1664  *ptr = start;
1665  return FALSE;
1666  }
1667  } else {
1668  /* Found an IPv4 address. */
1669  ip.ipv4 = cur_start;
1670  ip.ipv4_len = *ptr - cur_start;
1671
1672  TRACE("(%p %p %x): Found an attached IPv4 address %s len=%d.\n",
1673  ptr, data, flags, debugstr_wn(ip.ipv4, ip.ipv4_len),
1674  ip.ipv4_len);
1675
1676  /* IPv4 addresses can only appear at the end of an IPv6 address. */
1677  break;
1678  }
1679  }
1680  }
1681
/* NOTE(review): listing line 1682 is absent here. The check below reads
 * ip.components_size and ip.elision_size, which no visible statement assigns
 * after the memset above; presumably the missing line computes them.
 * Confirm against the original file.
 */
1683
1684  /* Make sure the IPv6 address adds up to 16 bytes. */
1685  if(ip.components_size + ip.elision_size != 16) {
1686  *ptr = start;
1687  TRACE("(%p %p %x): Invalid IPv6 address, did not add up to 16 bytes.\n",
1688  ptr, data, flags);
1689  return FALSE;
1690  }
1691
1692  if(ip.elision_size == 2) {
1693  /* For some reason on Windows if an elision that represents
1694  * only one h16 component is encountered at the very beginning or
1695  * end of an IPv6 address, Windows does not consider it a
1696  * valid IPv6 address.
1697  *
1698  * Ex: [::2:3:4:5:6:7] is not valid, even though the sum
1699  * of all the components == 128bits.
1700  */
1701  if(ip.elision < ip.components[0].str ||
1702  ip.elision > ip.components[ip.h16_count-1].str) {
1703  *ptr = start;
1704  TRACE("(%p %p %x): Invalid IPv6 address. Detected elision of 2 bytes at the beginning or end of the address.\n",
1705  ptr, data, flags);
1706  return FALSE;
1707  }
1708  }
1709
1710  data->host_type = Uri_HOST_IPV6;
1711  data->has_ipv6 = TRUE;
1712  data->ipv6_address = ip;
1713
1714  TRACE("(%p %p %x): Found valid IPv6 literal %s len=%d\n",
/* NOTE(review): listing line 1715, which presumably carries the leading
 * arguments of this TRACE call, is absent here.
 */
1716  (int)(*ptr-start));
1717  return TRUE;
1718 }
1719 
1720 /* IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1721) is absent from this listing. The IP-literal parser in this file
 * calls it as parse_ipvfuture(ptr, data, flags); confirm the exact prototype
 * against the original file.
 *
 * Matches, in order: a 'v' or 'V', one or more hex digits, a '.', and one or
 * more characters that are unreserved, sub-delims or ':'.
 *
 * On success *ptr is left after the last matching character,
 * data->host_type is set to Uri_HOST_UNKNOWN and TRUE is returned.
 * On failure *ptr is left where it started and FALSE is returned.
 */
1722  const WCHAR *start = *ptr;
1723
1724  /* IPvFuture has to start with a 'v' or 'V'. */
1725  if(**ptr != 'v' && **ptr != 'V')
1726  return FALSE;
1727
1728  /* Following the v there must be at least 1 hex digit. */
1729  ++(*ptr);
1730  if(!is_hexdigit(**ptr)) {
1731  *ptr = start;
1732  return FALSE;
1733  }
1734
1735  ++(*ptr);
1736  while(is_hexdigit(**ptr))
1737  ++(*ptr);
1738
1739  /* End of the hexdigit sequence must be a '.' */
1740  if(**ptr != '.') {
1741  *ptr = start;
1742  return FALSE;
1743  }
1744
/* After the '.' at least one address character is required. */
1745  ++(*ptr);
1746  if(!is_unreserved(**ptr) && !is_subdelim(**ptr) && **ptr != ':') {
1747  *ptr = start;
1748  return FALSE;
1749  }
1750
1751  ++(*ptr);
1752  while(is_unreserved(**ptr) || is_subdelim(**ptr) || **ptr == ':')
1753  ++(*ptr);
1754
1755  data->host_type = Uri_HOST_UNKNOWN;
1756
1757  TRACE("(%p %p %x): Parsed IPvFuture address %s len=%d\n", ptr, data, flags,
1758  debugstr_wn(start, *ptr-start), (int)(*ptr-start));
1759
1760  return TRUE;
1761 }
1762 
1763 /* IP-literal = "[" ( IPv6address / IPvFuture ) "]" */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1764) is absent from this listing. parse_host() calls it as
 * parse_ip_literal(ptr, data, flags, extras); confirm the exact prototype
 * against the original file.
 *
 * Parses an IP literal at *ptr. The surrounding '[' and ']' are required
 * unless ALLOW_BRACKETLESS_IP_LITERAL is set in 'extras'. The content is
 * tried as an IPv6 address first and then, unless SKIP_IP_FUTURE_CHECK is
 * set, as an IPvFuture address. A ":port" following the literal is parsed
 * here as well.
 *
 * Returns TRUE on success. On failure data->host is set to NULL and FALSE is
 * returned; *ptr is reset to the start of the literal.
 */
1765  data->host = *ptr;
1766
1767  if(**ptr != '[' && !(extras & ALLOW_BRACKETLESS_IP_LITERAL)) {
1768  data->host = NULL;
1769  return FALSE;
1770  } else if(**ptr == '[')
1771  ++(*ptr);
1772
1773  if(!parse_ipv6address(ptr, data, flags)) {
1774  if(extras & SKIP_IP_FUTURE_CHECK || !parse_ipvfuture(ptr, data, flags)) {
1775  *ptr = data->host;
1776  data->host = NULL;
1777  return FALSE;
1778  }
1779  }
1780
1781  if(**ptr != ']' && !(extras & ALLOW_BRACKETLESS_IP_LITERAL)) {
1782  *ptr = data->host;
1783  data->host = NULL;
1784  return FALSE;
1785  } else if(!**ptr && extras & ALLOW_BRACKETLESS_IP_LITERAL) {
1786  /* The IP literal didn't contain brackets and was followed by
1787  * a NULL terminator, so no reason to even check the port.
1788  */
1789  data->host_len = *ptr - data->host;
1790  return TRUE;
1791  }
1792
/* Step over the character that ended the literal. NOTE(review): with
 * ALLOW_BRACKETLESS_IP_LITERAL set that character is not necessarily ']'.
 */
1793  ++(*ptr);
1794  if(**ptr == ':') {
1795  ++(*ptr);
1796  /* If a valid port is not found, then let it trickle down to
1797  * parse_reg_name.
1798  */
1799  if(!parse_port(ptr, data, flags)) {
1800  *ptr = data->host;
1801  data->host = NULL;
1802  return FALSE;
1803  }
/* NOTE(review): data->host_len is not assigned on this path (a port was
 * parsed successfully); it is only set in the else branch below. Confirm
 * whether that is intended.
 */
1804  } else
1805  data->host_len = *ptr - data->host;
1806
1807  return TRUE;
1808 }
1809 
1810 /* Parses the host information from the URI.
1811  *
1812  * host = IP-literal / IPv4address / reg-name
1813  */
1814 static BOOL parse_host(const WCHAR **ptr, parse_data *data, DWORD flags, DWORD extras) {
1815  if(!parse_ip_literal(ptr, data, flags, extras)) {
1816  if(!parse_ipv4address(ptr, data, flags)) {
1817  if(!parse_reg_name(ptr, data, flags, extras)) {
1818  TRACE("(%p %p %x %x): Malformed URI, Unknown host type.\n",
1819  ptr, data, flags, extras);
1820  return FALSE;
1821  }
1822  }
1823  }
1824 
1825  return TRUE;
1826 }
1827 
1828 /* Parses the authority information from the URI.
1829  *
1830  * authority = [ userinfo "@" ] host [ ":" port ]
1831  */
/* NOTE(review): listing lines 1832-1833 are absent from this listing. They
 * presumably hold this function's signature and its first statement. The
 * grammar in the comment above includes userinfo, but no userinfo parsing is
 * visible in what remains, so it is presumably done on a missing line.
 * The hierarchical-part parser in this file calls this function as
 * parse_authority(ptr, data, flags); confirm against the original file.
 *
 * The visible part parses the host via parse_host() with no 'extras' bits
 * and returns TRUE if that succeeds, FALSE otherwise.
 */
1834
1835  /* Parsing the port will happen during one of the host parsing
1836  * routines (if the URI has a port).
1837  */
1838  if(!parse_host(ptr, data, flags, 0))
1839  return FALSE;
1840
1841  return TRUE;
1842 }
1843 
1844 /* Attempts to parse the path information of a hierarchical URI. */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1845) is absent from this listing, so its name and prototype cannot be
 * confirmed from here. The body uses 'ptr', 'data' and 'flags'.
 *
 * If the first character is already a path delimiter the path is empty:
 *  - for a wildcard scheme without must_have_path, data->path is set to NULL;
 *  - otherwise, unless Uri_CREATE_NO_CANONICALIZE is set, data->path is
 *    pointed at a static "/".
 * Otherwise characters are consumed up to the next path delimiter and
 * recorded in data->path / data->path_len.
 *
 * Returns TRUE on success. Returns FALSE, with *ptr reset to where the path
 * started, on an invalid percent escape (known, non-file schemes), on a
 * forbidden DOS path character (file scheme with
 * Uri_CREATE_FILE_USE_DOS_PATH), or on a backslash (known, non-file schemes
 * with Uri_CREATE_NO_CANONICALIZE).
 */
1846  const WCHAR *start = *ptr;
1847  static const WCHAR slash[] = {'/',0};
1848  const BOOL is_file = data->scheme_type == URL_SCHEME_FILE;
1849
1850  if(is_path_delim(data->scheme_type, **ptr)) {
1851  if(data->scheme_type == URL_SCHEME_WILDCARD && !data->must_have_path) {
1852  data->path = NULL;
1853  data->path_len = 0;
1854  } else if(!(flags & Uri_CREATE_NO_CANONICALIZE)) {
1855  /* If the path component is empty, then a '/' is added. */
1856  data->path = slash;
1857  data->path_len = 1;
1858  }
1859  } else {
1860  while(!is_path_delim(data->scheme_type, **ptr)) {
1861  if(**ptr == '%' && data->scheme_type != URL_SCHEME_UNKNOWN && !is_file) {
1862  if(!check_pct_encoded(ptr)) {
1863  *ptr = start;
1864  return FALSE;
1865  } else
1866  continue;
1867  } else if(is_forbidden_dos_path_char(**ptr) && is_file &&
1868  (flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
1869  /* File schemes with USE_DOS_PATH set aren't allowed to have
1870  * a '<' or '>' or '\"' appear in them.
1871  */
1872  *ptr = start;
1873  return FALSE;
1874  } else if(**ptr == '\\') {
1875  /* Not allowed to have a backslash if NO_CANONICALIZE is set
1876  * and the scheme is known type (but not a file scheme).
1877  */
1878  if(flags & Uri_CREATE_NO_CANONICALIZE) {
1879  if(data->scheme_type != URL_SCHEME_FILE &&
1880  data->scheme_type != URL_SCHEME_UNKNOWN) {
1881  *ptr = start;
1882  return FALSE;
1883  }
1884  }
1885  }
1886
1887  ++(*ptr);
1888  }
1889
1890  /* The only time a URI doesn't have a path is when
1891  * the NO_CANONICALIZE flag is set and the raw URI
1892  * didn't contain one.
1893  */
1894  if(*ptr == start) {
1895  data->path = NULL;
1896  data->path_len = 0;
1897  } else {
1898  data->path = start;
1899  data->path_len = *ptr - start;
1900  }
1901  }
1902
1903  if(data->path)
1904  TRACE("(%p %p %x): Parsed path %s len=%d\n", ptr, data, flags,
1905  debugstr_wn(data->path, data->path_len), data->path_len);
1906  else
1907  TRACE("(%p %p %x): The URI contained no path\n", ptr, data, flags);
1908
1909  return TRUE;
1910 }
1911 
1912 /* Parses the path of an opaque URI (much less strict than the parser
1913  * for a hierarchical URI).
1914  *
1915  * NOTE:
1916  * Windows allows invalid % encoded data to appear in opaque URI paths
1917  * for unknown scheme types.
1918  *
1919  * File schemes with USE_DOS_PATH set aren't allowed to have '<', '>', or '\"'
1920  * appear in them.
1921  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1922) is absent from this listing, so its name and prototype cannot be
 * confirmed from here. The body uses 'ptr', 'data' and 'flags'.
 *
 * Consumes characters from *ptr up to the next path delimiter and records
 * them in data->path / data->path_len.
 *
 * Returns TRUE on success. Returns FALSE, with *ptr set to data->path and
 * data->path cleared, on an invalid percent escape (known schemes only) or
 * on a forbidden DOS path character (file scheme with
 * Uri_CREATE_FILE_USE_DOS_PATH).
 */
1923  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
1924  const BOOL is_file = data->scheme_type == URL_SCHEME_FILE;
1925  const BOOL is_mailto = data->scheme_type == URL_SCHEME_MAILTO;
1926
/* For mailto URIs a leading "//" is left out of the recorded path; if
 * nothing follows it, no path is recorded at all. The scan below still
 * starts at *ptr, so the two slashes are walked over like other characters.
 */
1927  if (is_mailto && (*ptr)[0] == '/' && (*ptr)[1] == '/')
1928  {
1929  if ((*ptr)[2]) data->path = *ptr + 2;
1930  else data->path = NULL;
1931  }
1932  else
1933  data->path = *ptr;
1934
1935  while(!is_path_delim(data->scheme_type, **ptr)) {
1936  if(**ptr == '%' && known_scheme) {
1937  if(!check_pct_encoded(ptr)) {
1938  *ptr = data->path;
1939  data->path = NULL;
1940  return FALSE;
1941  } else
1942  continue;
1943  } else if(is_forbidden_dos_path_char(**ptr) && is_file &&
1944  (flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
1945  *ptr = data->path;
1946  data->path = NULL;
1947  return FALSE;
1948  }
1949
1950  ++(*ptr);
1951  }
1952
/* path_len is only written when a path was recorded above. */
1953  if (data->path) data->path_len = *ptr - data->path;
1954  TRACE("(%p %p %x): Parsed opaque URI path %s len=%d\n", ptr, data, flags,
1955  debugstr_wn(data->path, data->path_len), data->path_len);
1956  return TRUE;
1957 }
1958 
1959 /* Determines how the URI should be parsed after the scheme information.
1960  *
1961  * If the scheme is followed by "//", then it is treated as a hierarchical URI
1962  * which then the authority and path information will be parsed out. Otherwise, the
1963  * URI will be treated as an opaque URI which the authority information is not parsed
1964  * out.
1965  *
1966  * RFC 3986 definition of hier-part:
1967  *
1968  * hier-part = "//" authority path-abempty
1969  * / path-absolute
1970  * / path-rootless
1971  * / path-empty
1972  *
1973  * MSDN opaque URI definition:
1974  * scheme ":" path [ "#" fragment ]
1975  *
1976  * NOTES:
1977  * If the URI is of an unknown scheme type and has a "//" following the scheme then it
1978  * is treated as a hierarchical URI, but, if the CREATE_NO_CRACK_UNKNOWN_SCHEMES flag is
1979  * set then it is considered an opaque URI regardless of what follows the scheme information
1980  * (per MSDN documentation).
1981  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 1982) is absent from this listing. The URI parse driver in this file calls
 * it as parse_hierpart(pptr, data, flags); confirm the exact prototype
 * against the original file. Listing lines 2017 and 2032 are missing as
 * well; see the notes at those positions.
 *
 * Decides whether the text after the scheme is hierarchical or opaque, sets
 * data->is_opaque accordingly and parses the remainder. javascript: URIs are
 * special-cased: everything up to the terminating NUL becomes the path.
 */
1983  const WCHAR *start = *ptr;
1984
1985  data->must_have_path = FALSE;
1986
1987  /* For javascript: URIs, simply set everything as a path */
1988  if(data->scheme_type == URL_SCHEME_JAVASCRIPT) {
1989  data->path = *ptr;
1990  data->path_len = strlenW(*ptr);
1991  data->is_opaque = TRUE;
1992  *ptr += data->path_len;
1993  return TRUE;
1994  }
1995
1996  /* Checks if the authority information needs to be parsed. */
1997  if(is_hierarchical_uri(ptr, data)) {
1998  /* Only treat it as a hierarchical URI if the scheme_type is known or
1999  * the Uri_CREATE_NO_CRACK_UNKNOWN_SCHEMES flag is not set.
2000  */
2001  if(data->scheme_type != URL_SCHEME_UNKNOWN ||
2002  !(flags & Uri_CREATE_NO_CRACK_UNKNOWN_SCHEMES)) {
2003  TRACE("(%p %p %x): Treating URI as an hierarchical URI.\n", ptr, data, flags);
2004  data->is_opaque = FALSE;
2005
/* An explicit wildcard scheme followed by a second "//" must have a
 * path; the extra slashes are skipped.
 */
2006  if(data->scheme_type == URL_SCHEME_WILDCARD && !data->has_implicit_scheme) {
2007  if(**ptr == '/' && *(*ptr+1) == '/') {
2008  data->must_have_path = TRUE;
2009  *ptr += 2;
2010  }
2011  }
2012
2013  /* TODO: Handle hierarchical URI's, parse authority then parse the path. */
2014  if(!parse_authority(ptr, data, flags))
2015  return FALSE;
2016
/* NOTE(review): listing line 2017 is absent here. As shown, a successful
 * parse_authority() would fall through to the opaque handling below, which
 * contradicts the surrounding comments; presumably the missing line parses
 * the hierarchical path and returns its result. Confirm against the
 * original file.
 */
2018  } else
2019  /* Reset ptr to its starting position so opaque path parsing
2020  * begins at the correct location.
2021  */
2022  *ptr = start;
2023  }
2024
2025  /* If it reaches here, then the URI will be treated as an opaque
2026  * URI.
2027  */
2028
2029  TRACE("(%p %p %x): Treating URI as an opaque URI.\n", ptr, data, flags);
2030
2031  data->is_opaque = TRUE;
/* NOTE(review): listing line 2032 is absent here; presumably it is the
 * condition guarding the 'return FALSE' below (likely the opaque path
 * parse). Confirm against the original file.
 */
2033  return FALSE;
2034
2035  return TRUE;
2036 }
2037 
2038 /* Attempts to parse the query string from the URI.
2039  *
2040  * NOTES:
2041  * If NO_DECODE_EXTRA_INFO flag is set, then invalid percent encoded
2042  * data is allowed to appear in the query string. For unknown scheme types
2043  * invalid percent encoded data is allowed to appear regardless.
2044  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 2045) is absent from this listing. The URI parse driver in this file calls
 * it as parse_query(pptr, data, flags); confirm the exact prototype against
 * the original file.
 *
 * If *ptr is on a '?', consumes everything up to (not including) the next
 * '#' or the terminating NUL. data->query points at the '?' itself and
 * data->query_len includes it.
 *
 * Returns TRUE when there is no query string or when one was parsed.
 * Returns FALSE, with *ptr reset to the '?' and data->query cleared, on an
 * invalid percent escape (known schemes without
 * Uri_CREATE_NO_DECODE_EXTRA_INFO).
 */
2046  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
2047
2048  if(**ptr != '?') {
2049  TRACE("(%p %p %x): URI didn't contain a query string.\n", ptr, data, flags);
2050  return TRUE;
2051  }
2052
2053  data->query = *ptr;
2054
2055  ++(*ptr);
2056  while(**ptr && **ptr != '#') {
2057  if(**ptr == '%' && known_scheme &&
2058  !(flags & Uri_CREATE_NO_DECODE_EXTRA_INFO)) {
2059  if(!check_pct_encoded(ptr)) {
2060  *ptr = data->query;
2061  data->query = NULL;
2062  return FALSE;
2063  } else
2064  continue;
2065  }
2066
2067  ++(*ptr);
2068  }
2069
2070  data->query_len = *ptr - data->query;
2071
2072  TRACE("(%p %p %x): Parsed query string %s len=%d\n", ptr, data, flags,
2073  debugstr_wn(data->query, data->query_len), data->query_len);
2074  return TRUE;
2075 }
2076 
2077 /* Attempts to parse the fragment from the URI.
2078  *
2079  * NOTES:
2080  * If NO_DECODE_EXTRA_INFO flag is set, then invalid percent encoded
2081  * data is allowed to appear in the fragment. For unknown scheme types
2082  * invalid percent encoded data is allowed to appear regardless.
2083  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 2084) is absent from this listing. The URI parse driver in this file calls
 * it as parse_fragment(pptr, data, flags); confirm the exact prototype
 * against the original file.
 *
 * If *ptr is on a '#', consumes everything up to the terminating NUL.
 * data->fragment points at the '#' itself and data->fragment_len includes
 * it.
 *
 * Returns TRUE when there is no fragment or when one was parsed. Returns
 * FALSE, with *ptr reset to the '#' and data->fragment cleared, on an
 * invalid percent escape (known schemes without
 * Uri_CREATE_NO_DECODE_EXTRA_INFO).
 */
2085  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
2086
2087  if(**ptr != '#') {
2088  TRACE("(%p %p %x): URI didn't contain a fragment.\n", ptr, data, flags);
2089  return TRUE;
2090  }
2091
2092  data->fragment = *ptr;
2093
2094  ++(*ptr);
2095  while(**ptr) {
2096  if(**ptr == '%' && known_scheme &&
2097  !(flags & Uri_CREATE_NO_DECODE_EXTRA_INFO)) {
2098  if(!check_pct_encoded(ptr)) {
2099  *ptr = data->fragment;
2100  data->fragment = NULL;
2101  return FALSE;
2102  } else
2103  continue;
2104  }
2105
2106  ++(*ptr);
2107  }
2108
2109  data->fragment_len = *ptr - data->fragment;
2110
2111  TRACE("(%p %p %x): Parsed fragment %s len=%d\n", ptr, data, flags,
2112  debugstr_wn(data->fragment, data->fragment_len), data->fragment_len);
2113  return TRUE;
2114 }
2115 
2116 /* Parses and validates the components of the URI specified by data->uri
2117  * and stores the information it parses into 'data'.
2118  *
2119  * Returns TRUE if it successfully parsed the URI, FALSE otherwise.
2120  */
/* NOTE(review): the line carrying this function's signature (listing line
 * 2121) is absent from this listing, so its name and prototype cannot be
 * confirmed from here. The body uses 'data' and 'flags'.
 *
 * Runs the component parsers over data->uri in order: scheme, hierarchical
 * part, query, fragment. All of them share one cursor, so each continues
 * where the previous one stopped. The first parser that fails makes this
 * function return FALSE.
 */
2122  const WCHAR *ptr;
2123  const WCHAR **pptr;
2124
2125  ptr = data->uri;
2126  pptr = &ptr;
2127
2128  TRACE("(%p %x): BEGINNING TO PARSE URI %s.\n", data, flags, debugstr_w(data->uri));
2129
2130  if(!parse_scheme(pptr, data, flags, 0))
2131  return FALSE;
2132
2133  if(!parse_hierpart(pptr, data, flags))
2134  return FALSE;
2135
2136  if(!parse_query(pptr, data, flags))
2137  return FALSE;
2138
2139  if(!parse_fragment(pptr, data, flags))
2140  return FALSE;
2141
2142  TRACE("(%p %x): FINISHED PARSING URI.\n", data, flags);
2143  return TRUE;
2144 }
2145 
/* Appends the canonical form of data->username to the canonical URI.
 *
 * When 'computeOnly' is set nothing is written to uri->canon_uri; only
 * uri->canon_len is advanced, so the required length can be measured.
 *
 * uri->userinfo_start is set to the offset where the username begins, or to
 * -1 when there is no username.
 *
 * Always returns TRUE.
 */
2146 static BOOL canonicalize_username(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
2147  const WCHAR *ptr;
2148
2149  if(!data->username) {
2150  uri->userinfo_start = -1;
2151  return TRUE;
2152  }
2153
2154  uri->userinfo_start = uri->canon_len;
2155  for(ptr = data->username; ptr < data->username+data->username_len; ++ptr) {
2156  if(*ptr == '%') {
2157  /* Only decode % encoded values for known scheme types. */
2158  if(data->scheme_type != URL_SCHEME_UNKNOWN) {
2159  /* See if the value really needs decoding. */
/* NOTE(review): listing line 2160 is absent here. 'val' is used below
 * without a visible declaration, so the missing line presumably declares it
 * and sets it to the decoded value of the escape at ptr. Confirm against the
 * original file.
 */
2161  if(is_unreserved(val)) {
2162  if(!computeOnly)
2163  uri->canon_uri[uri->canon_len] = val;
2164
2165  ++uri->canon_len;
2166
2167  /* Move past the hex characters. */
2168  ptr += 2;
2169  continue;
2170  }
2171  }
2172  } else if(is_ascii(*ptr) && !is_reserved(*ptr) && !is_unreserved(*ptr) && *ptr != '\\') {
2173  /* Only percent encode forbidden characters if the NO_ENCODE_FORBIDDEN_CHARACTERS flag
2174  * is NOT set.
2175  */
2176  if(!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS)) {
2177  if(!computeOnly)
2178  pct_encode_val(*ptr, uri->canon_uri + uri->canon_len);
2179
/* The length grows by 3 for each percent-encoded character. */
2180  uri->canon_len += 3;
2181  continue;
2182  }
2183  }
2184
2185  if(!computeOnly)
2186  /* Nothing special, so just copy the character over. */
2187  uri->canon_uri[uri->canon_len] = *ptr;
2188  ++uri->canon_len;
2189  }
2190
2191  return TRUE;
2192 }
2193 
/* Appends ':' followed by the canonical form of data->password to the
 * canonical URI.
 *
 * When 'computeOnly' is set nothing is written to uri->canon_uri; only
 * uri->canon_len is advanced, so the required length can be measured.
 *
 * uri->userinfo_split is set to the offset of the ':' relative to
 * uri->userinfo_start, or to -1 when there is no password. If there was no
 * username, uri->userinfo_start is set here instead.
 *
 * Always returns TRUE.
 */
2194 static BOOL canonicalize_password(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
2195  const WCHAR *ptr;
2196
2197  if(!data->password) {
2198  uri->userinfo_split = -1;
2199  return TRUE;
2200  }
2201
2202  if(uri->userinfo_start == -1)
2203  /* Has a password, but, doesn't have a username. */
2204  uri->userinfo_start = uri->canon_len;
2205
2206  uri->userinfo_split = uri->canon_len - uri->userinfo_start;
2207
2208  /* Add the ':' to the userinfo component. */
2209  if(!computeOnly)
2210  uri->canon_uri[uri->canon_len] = ':';
2211  ++uri->canon_len;
2212
2213  for(ptr = data->password; ptr < data->password+data->password_len; ++ptr) {
2214  if(*ptr == '%') {
2215  /* Only decode % encoded values for known scheme types. */
2216  if(data->scheme_type != URL_SCHEME_UNKNOWN) {
2217  /* See if the value really needs decoding. */
/* NOTE(review): listing line 2218 is absent here. 'val' is used below
 * without a visible declaration, so the missing line presumably declares it
 * and sets it to the decoded value of the escape at ptr. Confirm against the
 * original file.
 */
2219  if(is_unreserved(val)) {
2220  if(!computeOnly)
2221  uri->canon_uri[uri->canon_len] = val;
2222
2223  ++uri->canon_len;
2224
2225  /* Move past the hex characters. */
2226  ptr += 2;
2227  continue;
2228  }
2229  }
2230  } else if(is_ascii(*ptr) && !is_reserved(*ptr) && !is_unreserved(*ptr) && *ptr != '\\') {
2231  /* Only percent encode forbidden characters if the NO_ENCODE_FORBIDDEN_CHARACTERS flag
2232  * is NOT set.
2233  */
2234  if(!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS)) {
2235  if(!computeOnly)
2236  pct_encode_val(*ptr, uri->canon_uri + uri->canon_len);
2237
/* The length grows by 3 for each percent-encoded character. */
2238  uri->canon_len += 3;
2239  continue;
2240  }
2241  }
2242
2243  if(!computeOnly)
2244  /* Nothing special, so just copy the character over. */
2245  uri->canon_uri[uri->canon_len] = *ptr;
2246  ++uri->canon_len;
2247  }
2248
2249  return TRUE;
2250 }
2251 
2252 /* Canonicalizes the userinfo of the URI represented by the parse_data.
2253  *
2254  * Canonicalization of the userinfo is a simple process. If there are any percent
2255  * encoded characters that fall in the "unreserved" character set, they are decoded
2256  * to their actual value. If a character is not in the "unreserved" or "reserved" sets
2257  * then it is percent encoded. Other than that the characters are copied over without
2258  * change.
2259  */
2260 static BOOL canonicalize_userinfo(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
2261  uri->userinfo_start = uri->userinfo_split = -1;
2262  uri->userinfo_len = 0;
2263 
2264  if(!data->username && !data->password)
2265  /* URI doesn't have userinfo, so nothing to do here. */
2266  return TRUE;
2267 
2268  if(!canonicalize_username(data, uri, flags, computeOnly))
2269  return FALSE;
2270 
2271  if(!canonicalize_password(data, uri, flags, computeOnly))
2272  return FALSE;
2273 
2274  uri->userinfo_len = uri->canon_len - uri->userinfo_start;
2275  if(!computeOnly)
2276  TRACE("(%p %p %x %d): Canonicalized userinfo, userinfo_start=%d, userinfo=%s, userinfo_split=%d userinfo_len=%d.\n",
2277  data, uri, flags, computeOnly, uri->userinfo_start, debugstr_wn(uri->canon_uri + uri->userinfo_start, uri->userinfo_len),
2278  uri->userinfo_split, uri->userinfo_len);
2279 
2280  /* Now insert the '@' after the userinfo. */
2281  if(!computeOnly)
2282  uri->canon_uri[uri->canon_len] = '@';
2283  ++uri->canon_len;
2284 
2285  return TRUE;
2286 }
2287 
2288 /* Attempts to canonicalize a reg_name.
2289  *
2290  * Things that happen:
2291  * 1) If Uri_CREATE_NO_CANONICALIZE flag is not set, then the reg_name is
2292  * lower cased. Unless it's an unknown scheme type, which case it's
2293  * no lower cased regardless.
2294  *
2295  * 2) Unreserved % encoded characters are decoded for known
2296  * scheme types.
2297  *
2298  * 3) Forbidden characters are % encoded as long as
2299  * Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS flag is not set and
2300  * it isn't an unknown scheme type.
2301  *
2302  * 4) If it's a file scheme and the host is "localhost" it's removed.
2303  *
2304  * 5) If it's a file scheme and Uri_CREATE_FILE_USE_DOS_PATH is set,
2305  * then the UNC path characters are added before the host name.
2306  */
2308  DWORD flags, BOOL computeOnly) {
2309  static const WCHAR localhostW[] =
2310  {'l','o','c','a','l','h','o','s','t',0};
2311  const WCHAR *ptr;
2312  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
2313 
2314  if(data->scheme_type == URL_SCHEME_FILE &&
2315  data->host_len == lstrlenW(localhostW)) {
2316  if(!StrCmpNIW(data->host, localhostW, data->host_len)) {
2317  uri->host_start = -1;
2318  uri->host_len = 0;
2319  uri->host_type = Uri_HOST_UNKNOWN;
2320  return TRUE;
2321  }
2322  }
2323 
2324  if(data->scheme_type == URL_SCHEME_FILE && flags & Uri_CREATE_FILE_USE_DOS_PATH) {
2325  if(!computeOnly) {
2326  uri->canon_uri[uri->canon_len] = '\\';
2327  uri->canon_uri[uri->canon_len+1] = '\\';
2328  }
2329  uri->canon_len += 2;
2330  uri->authority_start = uri->canon_len;
2331  }
2332 
2333  uri->host_start = uri->canon_len;
2334 
2335  for(ptr = data->host; ptr < data->host+data->host_len; ++ptr) {
2336  if(*ptr == '%' && known_scheme) {
2338  if(is_unreserved(val)) {
2339  /* If NO_CANONICALIZE is not set, then windows lower cases the
2340  * decoded value.
2341  */
2342  if(!(flags & Uri_CREATE_NO_CANONICALIZE) && isupperW(val)) {
2343  if(!computeOnly)
2344  uri->canon_uri[uri->canon_len] = tolowerW(val);
2345  } else {
2346  if(!computeOnly)
2347  uri->canon_uri[uri->canon_len] = val;
2348  }
2349  ++uri->canon_len;
2350 
2351  /* Skip past the % encoded character. */
2352  ptr += 2;
2353  continue;
2354  } else {
2355  /* Just copy the % over. */
2356  if(!computeOnly)
2357  uri->canon_uri[uri->canon_len] = *ptr;
2358  ++uri->canon_len;
2359  }
2360  } else if(*ptr == '\\') {
2361  /* Only unknown scheme types could have made it here with a '\\' in the host name. */
2362  if(!computeOnly)
2363  uri->canon_uri[uri->canon_len] = *ptr;
2364  ++uri->canon_len;
2365  } else if(!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS) && is_ascii(*ptr) &&
2366  !is_unreserved(*ptr) && !is_reserved(*ptr) && known_scheme) {
2367  if(!computeOnly) {
2368  pct_encode_val(*ptr, uri->canon_uri+uri->canon_len);
2369 
2370  /* The percent encoded value gets lower cased also. */
2371  if(!(flags & Uri_CREATE_NO_CANONICALIZE)) {
2372  uri->canon_uri[uri->canon_len+1] = tolowerW(uri->canon_uri[uri->canon_len+1]);
2373  uri->canon_uri[uri->canon_len+2] = tolowerW(uri->canon_uri[uri->canon_len+2]);
2374  }
2375  }
2376 
2377  uri->canon_len += 3;
2378  } else {
2379  if(!computeOnly) {
2380  if(!(flags & Uri_CREATE_NO_CANONICALIZE) && known_scheme)
2381  uri->canon_uri[uri->canon_len] = tolowerW(*ptr);
2382  else
2383  uri->canon_uri[uri->canon_len] = *ptr;
2384  }
2385 
2386  ++uri->canon_len;
2387  }
2388  }
2389 
2390  uri->host_len = uri->canon_len - uri->host_start;
2391 
2392  if(!computeOnly)
2393  TRACE("(%p %p %x %d): Canonicalize reg_name=%s len=%d\n", data, uri, flags,
2394  computeOnly, debugstr_wn(uri->canon_uri+uri->host_start, uri->host_len),
2395  uri->host_len);
2396 
2397  if(!computeOnly)
2398  find_domain_name(uri->canon_uri+uri->host_start, uri->host_len,
2399  &(uri->domain_offset));
2400 
2401  return TRUE;
2402 }
2403 
2404 /* Attempts to canonicalize an implicit IPv4 address. */
2406  uri->host_start = uri->canon_len;
2407 
2408  TRACE("%u\n", data->implicit_ipv4);
2409  /* For unknown scheme types Windows doesn't convert
2410  * the value into an IP address, but it still considers
2411  * it an IPv4 address.
2412  */
2413  if(data->scheme_type == URL_SCHEME_UNKNOWN) {
2414  if(!computeOnly)
2415  memcpy(uri->canon_uri+uri->canon_len, data->host, data->host_len*sizeof(WCHAR));
2416  uri->canon_len += data->host_len;
2417  } else {
2418  if(!computeOnly)
2419  uri->canon_len += ui2ipv4(uri->canon_uri+uri->canon_len, data->implicit_ipv4);
2420  else
2421  uri->canon_len += ui2ipv4(NULL, data->implicit_ipv4);
2422  }
2423 
2424  uri->host_len = uri->canon_len - uri->host_start;
2425  uri->host_type = Uri_HOST_IPV4;
2426 
2427  if(!computeOnly)
2428  TRACE("%p %p %x %d): Canonicalized implicit IP address=%s len=%d\n",
2429  data, uri, flags, computeOnly,
2430  debugstr_wn(uri->canon_uri+uri->host_start, uri->host_len),
2431  uri->host_len);
2432 
2433  return TRUE;
2434 }
2435 
2436 /* Attempts to canonicalize an IPv4 address.
2437  *
2438  * If the parse_data represents a URI that has an implicit IPv4 address
2439  * (ex. http://256/, this function will convert 256 into 0.0.1.0). If
2440  * the implicit IP address exceeds the value of UINT_MAX (maximum value
2441  * for an IPv4 address) it's canonicalized as if it were a reg-name.
2442  *
2443  * If the parse_data contains a partial or full IPv4 address it normalizes it.
2444  * A partial IPv4 address is something like "192.0" and would be normalized to
2445  * "192.0.0.0". With a full (or partial) IPv4 address like "192.002.01.003" would
2446  * be normalized to "192.2.1.3".
2447  *
2448  * NOTES:
2449  * Windows ONLY normalizes IPv4 address for known scheme types (one that isn't
2450  * URL_SCHEME_UNKNOWN). For unknown scheme types, it simply copies the data from
2451  * the original URI into the canonicalized URI, but, it still recognizes URI's
2452  * host type as HOST_IPV4.
2453  */
2455  if(data->has_implicit_ip)
2456  return canonicalize_implicit_ipv4address(data, uri, flags, computeOnly);
2457  else {
2458  uri->host_start = uri->canon_len;
2459 
2460  /* Windows only normalizes for known scheme types. */
2461  if(data->scheme_type != URL_SCHEME_UNKNOWN) {
2462  /* parse_data contains a partial or full IPv4 address, so normalize it. */
2463  DWORD i, octetDigitCount = 0, octetCount = 0;
2464  BOOL octetHasDigit = FALSE;
2465 
2466  for(i = 0; i < data->host_len; ++i) {
2467  if(data->host[i] == '0' && !octetHasDigit) {
2468  /* Can ignore leading zeros if:
2469  * 1) It isn't the last digit of the octet.
2470  * 2) i+1 != data->host_len
2471  * 3) i+1 != '.'
2472  */
2473  if(octetDigitCount == 2 ||
2474  i+1 == data->host_len ||
2475  data->host[i+1] == '.') {
2476  if(!computeOnly)
2477  uri->canon_uri[uri->canon_len] = data->host[i];
2478  ++uri->canon_len;
2479  TRACE("Adding zero\n");
2480  }
2481  } else if(data->host[i] == '.') {
2482  if(!computeOnly)
2483  uri->canon_uri[uri->canon_len] = data->host[i];
2484  ++uri->canon_len;
2485 
2486  octetDigitCount = 0;
2487  octetHasDigit = FALSE;
2488  ++octetCount;
2489  } else {
2490  if(!computeOnly)
2491  uri->canon_uri[uri->canon_len] = data->host[i];
2492  ++uri->canon_len;
2493 
2494  ++octetDigitCount;
2495  octetHasDigit = TRUE;
2496  }
2497  }
2498 
2499  /* Make sure the canonicalized IP address has 4 dec-octets.
2500  * If doesn't add "0" ones until there is 4;
2501  */
2502  for( ; octetCount < 3; ++octetCount) {
2503  if(!computeOnly) {
2504  uri->canon_uri[uri->canon_len] = '.';
2505  uri->canon_uri[uri->canon_len+1] = '0';
2506  }
2507 
2508  uri->canon_len += 2;
2509  }
2510  } else {
2511  /* Windows doesn't normalize addresses in unknown schemes. */
2512  if(!computeOnly)
2513  memcpy(uri->canon_uri+uri->canon_len, data->host, data->host_len*sizeof(WCHAR));
2514  uri->canon_len += data->host_len;
2515  }
2516 
2517  uri->host_len = uri->canon_len - uri->host_start;
2518  if(!computeOnly)
2519  TRACE("(%p %p %x %d): Canonicalized IPv4 address, ip=%s len=%d\n",
2520  data, uri, flags, computeOnly,
2521  debugstr_wn(uri->canon_uri+uri->host_start, uri->host_len),
2522  uri->host_len);
2523  }
2524 
2525  return TRUE;
2526 }
2527 
2528 /* Attempts to canonicalize the IPv6 address of the URI.
2529  *
2530  * Multiple things happen during the canonicalization of an IPv6 address:
2531  * 1) Any leading zero's in a h16 component are removed.
2532  * Ex: [0001:0022::] -> [1:22::]
2533  *
2534  * 2) The longest sequence of zero h16 components are compressed
2535  * into a "::" (elision). If there's a tie, the first is chosen.
2536  *
2537  * Ex: [0:0:0:0:1:6:7:8] -> [::1:6:7:8]
2538  * [0:0:0:0:1:2::] -> [::1:2:0:0]
2539  * [0:0:1:2:0:0:7:8] -> [::1:2:0:0:7:8]
2540  *
2541  * 3) If an IPv4 address is attached to the IPv6 address, it's
2542  * also normalized.
2543  * Ex: [::001.002.022.000] -> [::1.2.22.0]
2544  *
2545  * 4) If an elision is present, but, only represents one h16 component
2546  * it's expanded.
2547  *
2548  * Ex: [1::2:3:4:5:6:7] -> [1:0:2:3:4:5:6:7]
2549  *
2550  * 5) If the IPv6 address contains an IPv4 address and there exists
2551  * at least 1 non-zero h16 component the IPv4 address is converted
2552  * into two h16 components, otherwise it's normalized and kept as is.
2553  *
2554  * Ex: [::192.200.003.4] -> [::192.200.3.4]
2555  * [ffff::192.200.003.4] -> [ffff::c0c8:3041]
2556  *
2557  * NOTE:
2558  * For unknown scheme types Windows simply copies the address over without any
2559  * changes.
2560  *
2561  * IPv4 address can be included in an elision if all its components are 0's.
2562  */
2564  DWORD flags, BOOL computeOnly) {
2565  uri->host_start = uri->canon_len;
2566 
2567  if(data->scheme_type == URL_SCHEME_UNKNOWN) {
2568  if(!computeOnly)
2569  memcpy(uri->canon_uri+uri->canon_len, data->host, data->host_len*sizeof(WCHAR));
2570  uri->canon_len += data->host_len;
2571  } else {
2572  USHORT values[8];
2573  INT elision_start;
2574  DWORD i, elision_len;
2575 
2576  if(!ipv6_to_number(&(data->ipv6_address), values)) {
2577  TRACE("(%p %p %x %d): Failed to compute numerical value for IPv6 address.\n",
2578  data, uri, flags, computeOnly);
2579  return FALSE;
2580  }
2581 
2582  if(!computeOnly)
2583  uri->canon_uri[uri->canon_len] = '[';
2584  ++uri->canon_len;
2585 
2586  /* Find where the elision should occur (if any). */
2587  compute_elision_location(&(data->ipv6_address), values, &elision_start, &elision_len);
2588 
2589  TRACE("%p %p %x %d): Elision starts at %d, len=%u\n", data, uri, flags,
2590  computeOnly, elision_start, elision_len);
2591 
2592  for(i = 0; i < 8; ++i) {
2593  BOOL in_elision = (elision_start > -1 && i >= elision_start &&
2594  i < elision_start+elision_len);
2595  BOOL do_ipv4 = (i == 6 && data->ipv6_address.ipv4 && !in_elision &&
2596  data->ipv6_address.h16_count == 0);
2597 
2598  if(i == elision_start) {
2599  if(!computeOnly) {
2600  uri->canon_uri[uri->canon_len] = ':';
2601  uri->canon_uri[uri->canon_len+1] = ':';
2602  }
2603  uri->canon_len += 2;
2604  }
2605 
2606  /* We can ignore the current component if we're in the elision. */
2607  if(in_elision)
2608  continue;
2609 
2610  /* We only add a ':' if we're not at i == 0, or when we're at
2611  * the very end of elision range since the ':' colon was handled
2612  * earlier. Otherwise we would end up with ":::" after elision.
2613  */
2614  if(i != 0 && !(elision_start > -1 && i == elision_start+elision_len)) {
2615  if(!computeOnly)
2616  uri->canon_uri[uri->canon_len] = ':';
2617  ++uri->canon_len;
2618  }
2619 
2620  if(do_ipv4) {
2621  UINT val;
2622  DWORD len;
2623 
2624  /* Combine the two parts of the IPv4 address values. */
2625  val = values[i];
2626  val <<= 16;
2627  val += values[i+1];
2628 
2629  if(!computeOnly)
2630  len = ui2ipv4(uri->canon_uri+uri->canon_len, val);
2631  else
2632  len = ui2ipv4(NULL, val);
2633 
2634  uri->canon_len += len;
2635  ++i;
2636  } else {
2637  /* Write a regular h16 component to the URI. */
2638 
2639  /* Short circuit for the trivial case. */
2640  if(values[i] == 0) {
2641  if(!computeOnly)
2642  uri->canon_uri[uri->canon_len] = '0';
2643  ++uri->canon_len;
2644  } else {
2645  static const WCHAR formatW[] = {'%','x',0};
2646 
2647  if(!computeOnly)
2648  uri->canon_len += sprintfW(uri->canon_uri+uri->canon_len,
2649  formatW, values[i]);
2650  else {
2651  WCHAR tmp[5];
2652  uri->canon_len += sprintfW(tmp, formatW, values[i]);
2653  }
2654  }
2655  }
2656  }
2657 
2658  /* Add the closing ']'. */
2659  if(!computeOnly)
2660  uri->canon_uri[uri->canon_len] = ']';
2661  ++uri->canon_len;
2662  }
2663 
2664  uri->host_len = uri->canon_len - uri->host_start;
2665 
2666  if(!computeOnly)
2667  TRACE("(%p %p %x %d): Canonicalized IPv6 address %s, len=%d\n", data, uri, flags,
2668  computeOnly, debugstr_wn(uri->canon_uri+uri->host_start, uri->host_len),
2669  uri->host_len);
2670 
2671  return TRUE;
2672 }
2673 
2674 /* Attempts to canonicalize the host of the URI (if any). */
2675 static BOOL canonicalize_host(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
2676  uri->host_start = -1;
2677  uri->host_len = 0;
2678  uri->domain_offset = -1;
2679 
2680  if(data->host) {
2681  switch(data->host_type) {
2682  case Uri_HOST_DNS:
2683  uri->host_type = Uri_HOST_DNS;
2684  if(!canonicalize_reg_name(data, uri, flags, computeOnly))
2685  return FALSE;
2686 
2687  break;
2688  case Uri_HOST_IPV4:
2689  uri->host_type = Uri_HOST_IPV4;
2690  if(!canonicalize_ipv4address(data, uri, flags, computeOnly))
2691  return FALSE;
2692 
2693  break;
2694  case Uri_HOST_IPV6:
2695  if(!canonicalize_ipv6address(data, uri, flags, computeOnly))
2696  return FALSE;
2697 
2698  uri->host_type = Uri_HOST_IPV6;
2699  break;
2700  case Uri_HOST_UNKNOWN:
2701  if(data->host_len > 0 || data->scheme_type != URL_SCHEME_FILE) {
2702  uri->host_start = uri->canon_len;
2703 
2704  /* Nothing happens to unknown host types. */
2705  if(!computeOnly)
2706  memcpy(uri->canon_uri+uri->canon_len, data->host, data->host_len*sizeof(WCHAR));
2707  uri->canon_len += data->host_len;
2708  uri->host_len = data->host_len;
2709  }
2710 
2711  uri->host_type = Uri_HOST_UNKNOWN;
2712  break;
2713  default:
2714  FIXME("(%p %p %x %d): Canonicalization for host type %d not supported.\n", data,
2715  uri, flags, computeOnly, data->host_type);
2716  return FALSE;
2717  }
2718  }
2719 
2720  return TRUE;
2721 }
2722 
2723 static BOOL canonicalize_port(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
2724  BOOL has_default_port = FALSE;
2725  USHORT default_port = 0;
2726  DWORD i;
2727 
2728  uri->port_offset = -1;
2729 
2730  /* Check if the scheme has a default port. */
2731  for(i = 0; i < ARRAY_SIZE(default_ports); ++i) {
2732  if(default_ports[i].scheme == data->scheme_type) {
2733  has_default_port = TRUE;
2734  default_port = default_ports[i].port;
2735  break;
2736  }
2737  }
2738 
2739  uri->has_port = data->has_port || has_default_port;
2740 
2741  /* Possible cases:
2742  * 1) Has a port which is the default port.
2743  * 2) Has a port (not the default).
2744  * 3) Doesn't have a port, but, scheme has a default port.
2745  * 4) No port.
2746  */
2747  if(has_default_port && data->has_port && data->port_value == default_port) {
2748  /* If it's the default port and this flag isn't set, don't do anything. */
2749  if(flags & Uri_CREATE_NO_CANONICALIZE) {
2750  uri->port_offset = uri->canon_len-uri->authority_start;
2751  if(!computeOnly)
2752  uri->canon_uri[uri->canon_len] = ':';
2753  ++uri->canon_len;
2754 
2755  if(data->port) {
2756  /* Copy the original port over. */
2757  if(!computeOnly)
2758  memcpy(uri->canon_uri+uri->canon_len, data->port, data->port_len*sizeof(WCHAR));
2759  uri->canon_len += data->port_len;
2760  } else {
2761  if(!computeOnly)
2762  uri->canon_len += ui2str(uri->canon_uri+uri->canon_len, data->port_value);
2763  else
2764  uri->canon_len += ui2str(NULL, data->port_value);
2765  }
2766  }
2767 
2768  uri->port = default_port;
2769  } else if(data->has_port) {
2770  uri->port_offset = uri->canon_len-uri->authority_start;
2771  if(!computeOnly)
2772  uri->canon_uri[uri->canon_len] = ':';
2773  ++uri->canon_len;
2774 
2775  if(flags & Uri_CREATE_NO_CANONICALIZE && data->port) {
2776  /* Copy the original over without changes. */
2777  if(!computeOnly)
2778  memcpy(uri->canon_uri+uri->canon_len, data->port, data->port_len*sizeof(WCHAR));
2779  uri->canon_len += data->port_len;
2780  } else {
2781  if(!computeOnly)
2782  uri->canon_len += ui2str(uri->canon_uri+uri->canon_len, data->port_value);
2783  else
2784  uri->canon_len += ui2str(NULL, data->port_value);
2785  }
2786 
2787  uri->port = data->port_value;
2788  } else if(has_default_port)
2789  uri->port = default_port;
2790 
2791  return TRUE;
2792 }
2793 
2794 /* Canonicalizes the authority of the URI represented by the parse_data. */
2796  uri->authority_start = uri->canon_len;
2797  uri->authority_len = 0;
2798 
2799  if(!canonicalize_userinfo(data, uri, flags, computeOnly))
2800  return FALSE;
2801 
2802  if(!canonicalize_host(data, uri, flags, computeOnly))
2803  return FALSE;
2804 
2805  if(!canonicalize_port(data, uri, flags, computeOnly))
2806  return FALSE;
2807 
2808  if(uri->host_start != -1 || (data->is_relative && (data->password || data->username)))
2809  uri->authority_len = uri->canon_len - uri->authority_start;
2810  else
2811  uri->authority_start = -1;
2812 
2813  return TRUE;
2814 }
2815 
2816 /* Attempts to canonicalize the path of a hierarchical URI.
2817  *
2818  * Things that happen:
2819  * 1). Forbidden characters are percent encoded, unless the NO_ENCODE_FORBIDDEN
2820  * flag is set or it's a file URI. Forbidden characters are always encoded
2821  * for file schemes regardless and forbidden characters are never encoded
2822  * for unknown scheme types.
2823  *
2824  * 2). For known scheme types '\\' are changed to '/'.
2825  *
2826  * 3). Percent encoded, unreserved characters are decoded to their actual values.
2827  * Unless the scheme type is unknown. For file schemes any percent encoded
2828  * character in the unreserved or reserved set is decoded.
2829  *
2830  * 4). For File schemes if the path is starts with a drive letter and doesn't
2831  * start with a '/' then one is appended.
2832  * Ex: file://c:/test.mp3 -> file:///c:/test.mp3
2833  *
2834  * 5). Dot segments are removed from the path for all scheme types
2835  * unless NO_CANONICALIZE flag is set. Dot segments aren't removed
2836  * for wildcard scheme types.
2837  *
2838  * NOTES:
2839  * file://c:/test%20test -> file:///c:/test%2520test
2840  * file://c:/test%3Etest -> file:///c:/test%253Etest
2841  * if Uri_CREATE_FILE_USE_DOS_PATH is not set:
2842  * file:///c:/test%20test -> file:///c:/test%20test
2843  * file:///c:/test%test -> file:///c:/test%25test
2844  */
2846  BOOL is_implicit_scheme, WCHAR *ret_path) {
2847  const BOOL known_scheme = scheme_type != URL_SCHEME_UNKNOWN;
2848  const BOOL is_file = scheme_type == URL_SCHEME_FILE;
2849  const BOOL is_res = scheme_type == URL_SCHEME_RES;
2850  const WCHAR *ptr;
2851  BOOL escape_pct = FALSE;
2852  DWORD len = 0;
2853 
2854  if(!path)
2855  return 0;
2856 
2857  ptr = path;
2858 
2859  if(is_file && !has_host) {
2860  /* Check if a '/' needs to be appended for the file scheme. */
2861  if(path_len > 1 && is_drive_path(ptr) && !(flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
2862  if(ret_path)
2863  ret_path[len] = '/';
2864  len++;
2865  escape_pct = TRUE;
2866  } else if(*ptr == '/') {
2867  if(!(flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
2868  /* Copy the extra '/' over. */
2869  if(ret_path)
2870  ret_path[len] = '/';
2871  len++;
2872  }
2873  ++ptr;
2874  }
2875 
2876  if(is_drive_path(ptr)) {
2877  if(ret_path) {
2878  ret_path[len] = *ptr;
2879  /* If there's a '|' after the drive letter, convert it to a ':'. */
2880  ret_path[len+1] = ':';
2881  }
2882  ptr += 2;
2883  len += 2;
2884  }
2885  }
2886 
2887  if(!is_file && *path && *path != '/') {
2888  /* Prepend a '/' to the path if it doesn't have one. */
2889  if(ret_path)
2890  ret_path[len] = '/';
2891  len++;
2892  }
2893 
2894  for(; ptr < path+path_len; ++ptr) {
2895  BOOL do_default_action = TRUE;
2896 
2897  if(*ptr == '%' && !is_res) {
2898  const WCHAR *tmp = ptr;
2899  WCHAR val;
2900 
2901  /* Check if the % represents a valid encoded char, or if it needs encoding. */
2902  BOOL force_encode = !check_pct_encoded(&tmp) && is_file && !(flags&Uri_CREATE_FILE_USE_DOS_PATH);
2903  val = decode_pct_val(ptr);
2904 
2905  if(force_encode || escape_pct) {
2906  /* Escape the percent sign in the file URI. */
2907  if(ret_path)
2908  pct_encode_val(*ptr, ret_path+len);
2909  len += 3;
2910  do_default_action = FALSE;
2911  } else if((is_unreserved(val) && known_scheme) ||
2912  (is_file && !is_implicit_scheme && (is_unreserved(val) || is_reserved(val) ||
2913  (val && flags&Uri_CREATE_FILE_USE_DOS_PATH && !is_forbidden_dos_path_char(val))))) {
2914  if(ret_path)
2915  ret_path[len] = val;
2916  len++;
2917 
2918  ptr += 2;
2919  continue;
2920  }
2921  } else if(*ptr == '/' && is_file && (flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
2922  /* Convert the '/' back to a '\\'. */
2923  if(ret_path)
2924  ret_path[len] = '\\';
2925  len++;
2926  do_default_action = FALSE;
2927  } else if(*ptr == '\\' && known_scheme) {
2928  if(!(is_file && (flags & Uri_CREATE_FILE_USE_DOS_PATH))) {
2929  /* Convert '\\' into a '/'. */
2930  if(ret_path)
2931  ret_path[len] = '/';
2932  len++;
2933  do_default_action = FALSE;
2934  }
2935  } else if(known_scheme && !is_res && is_ascii(*ptr) && !is_unreserved(*ptr) && !is_reserved(*ptr) &&
2936  (!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS) || is_file)) {
2937  if(!is_file || !(flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
2938  /* Escape the forbidden character. */
2939  if(ret_path)
2940  pct_encode_val(*ptr, ret_path+len);
2941  len += 3;
2942  do_default_action = FALSE;
2943  }
2944  }
2945 
2946  if(do_default_action) {
2947  if(ret_path)
2948  ret_path[len] = *ptr;
2949  len++;
2950  }
2951  }
2952 
2953  /* Removing the dot segments only happens when it's not in
2954  * computeOnly mode and it's not a wildcard scheme. File schemes
2955  * with USE_DOS_PATH set don't get dot segments removed.
2956  */
2957  if(!(is_file && (flags & Uri_CREATE_FILE_USE_DOS_PATH)) &&
2958  scheme_type != URL_SCHEME_WILDCARD) {
2959  if(!(flags & Uri_CREATE_NO_CANONICALIZE) && ret_path) {
2960  /* Remove the dot segments (if any) and reset everything to the new
2961  * correct length.
2962  */
2963  len = remove_dot_segments(ret_path, len);
2964  }
2965  }
2966 
2967  if(ret_path)
2968  TRACE("Canonicalized path %s len=%d\n", debugstr_wn(ret_path, len), len);
2969  return len;
2970 }
2971 
2972 /* Attempts to canonicalize the path for an opaque URI.
2973  *
2974  * For known scheme types:
2975  * 1) forbidden characters are percent encoded if
2976  * NO_ENCODE_FORBIDDEN_CHARACTERS isn't set.
2977  *
2978  * 2) Percent encoded, unreserved characters are decoded
2979  * to their actual values, for known scheme types.
2980  *
2981  * 3) '\\' are changed to '/' for known scheme types
2982  * except for mailto schemes.
2983  *
2984  * 4) For file schemes, if USE_DOS_PATH is set all '/'
2985  * are converted to backslashes.
2986  *
2987  * 5) For file schemes, if USE_DOS_PATH isn't set all '\'
2988  * are converted to forward slashes.
2989  */
2991  const WCHAR *ptr;
2992  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
2993  const BOOL is_file = data->scheme_type == URL_SCHEME_FILE;
2994  const BOOL is_mk = data->scheme_type == URL_SCHEME_MK;
2995 
2996  if(!data->path) {
2997  uri->path_start = -1;
2998  uri->path_len = 0;
2999  return TRUE;
3000  }
3001 
3002  uri->path_start = uri->canon_len;
3003 
3004  if(is_mk){
3005  /* hijack this flag for SCHEME_MK to tell the function when to start
3006  * converting slashes */
3007  flags |= Uri_CREATE_FILE_USE_DOS_PATH;
3008  }
3009 
3010  /* For javascript: URIs, simply copy path part without any canonicalization */
3011  if(data->scheme_type == URL_SCHEME_JAVASCRIPT) {
3012  if(!computeOnly)
3013  memcpy(uri->canon_uri+uri->canon_len, data->path, data->path_len*sizeof(WCHAR));
3014  uri->path_len = data->path_len;
3015  uri->canon_len += data->path_len;
3016  return TRUE;
3017  }
3018 
3019  /* Windows doesn't allow a "//" to appear after the scheme
3020  * of a URI, if it's an opaque URI.
3021  */
3022  if(data->scheme && *(data->path) == '/' && *(data->path+1) == '/') {
3023  /* So it inserts a "/." before the "//" if it exists. */
3024  if(!computeOnly) {
3025  uri->canon_uri[uri->canon_len] = '/';
3026  uri->canon_uri[uri->canon_len+1] = '.';
3027  }
3028 
3029  uri->canon_len += 2;
3030  }
3031 
3032  for(ptr = data->path; ptr < data->path+data->path_len; ++ptr) {
3033  BOOL do_default_action = TRUE;
3034 
3035  if(*ptr == '%' && known_scheme) {
3037 
3038  if(is_unreserved(val)) {
3039  if(!computeOnly)
3040  uri->canon_uri[uri->canon_len] = val;
3041  ++uri->canon_len;
3042 
3043  ptr += 2;
3044  continue;
3045  }
3046  } else if(*ptr == '/' && is_file && (flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
3047  if(!computeOnly)
3048  uri->canon_uri[uri->canon_len] = '\\';
3049  ++uri->canon_len;
3050  do_default_action = FALSE;
3051  } else if(*ptr == '\\') {
3052  if((data->is_relative || is_mk || is_file) && !(flags & Uri_CREATE_FILE_USE_DOS_PATH)) {
3053  /* Convert to a '/'. */
3054  if(!computeOnly)
3055  uri->canon_uri[uri->canon_len] = '/';
3056  ++uri->canon_len;
3057  do_default_action = FALSE;
3058  }
3059  } else if(is_mk && *ptr == ':' && ptr + 1 < data->path + data->path_len && *(ptr + 1) == ':') {
3060  flags &= ~Uri_CREATE_FILE_USE_DOS_PATH;
3061  } else if(known_scheme && is_ascii(*ptr) && !is_unreserved(*ptr) && !is_reserved(*ptr) &&
3062  !(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS)) {
3063  if(!(is_file && (flags & Uri_CREATE_FILE_USE_DOS_PATH))) {
3064  if(!computeOnly)
3065  pct_encode_val(*ptr, uri->canon_uri+uri->canon_len);
3066  uri->canon_len += 3;
3067  do_default_action = FALSE;
3068  }
3069  }
3070 
3071  if(do_default_action) {
3072  if(!computeOnly)
3073  uri->canon_uri[uri->canon_len] = *ptr;
3074  ++uri->canon_len;
3075  }
3076  }
3077 
3078  if(is_mk && !computeOnly && !(flags & Uri_CREATE_NO_CANONICALIZE)) {
3079  DWORD new_len = remove_dot_segments(uri->canon_uri + uri->path_start,
3080  uri->canon_len - uri->path_start);
3081  uri->canon_len = uri->path_start + new_len;
3082  }
3083 
3084  uri->path_len = uri->canon_len - uri->path_start;
3085 
3086  if(!computeOnly)
3087  TRACE("(%p %p %x %d): Canonicalized opaque URI path %s len=%d\n", data, uri, flags, computeOnly,
3088  debugstr_wn(uri->canon_uri+uri->path_start, uri->path_len), uri->path_len);
3089  return TRUE;
3090 }
3091 
3092 /* Determines how the URI represented by the parse_data should be canonicalized.
3093  *
3094  * Essentially, if the parse_data represents an hierarchical URI then it calls
3095  * canonicalize_authority and the canonicalization functions for the path. If the
3096  * URI is opaque it canonicalizes the path of the URI.
3097  */
3098 static BOOL canonicalize_hierpart(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
3099  if(!data->is_opaque || (data->is_relative && (data->password || data->username))) {
3100  /* "//" is only added for non-wildcard scheme types.
3101  *
3102  * A "//" is only added to a relative URI if it has a
3103  * host or port component (this only happens if a IUriBuilder
3104  * is generating an IUri).
3105  */
3106  if((data->is_relative && (data->host || data->has_port)) ||
3107  (!data->is_relative && data->scheme_type != URL_SCHEME_WILDCARD)) {
3108  if(data->scheme_type == URL_SCHEME_WILDCARD)
3109  FIXME("Here\n");
3110 
3111  if(!computeOnly) {
3112  INT pos = uri->canon_len;
3113 
3114  uri->canon_uri[pos] = '/';
3115  uri->canon_uri[pos+1] = '/';
3116  }
3117  uri->canon_len += 2;
3118  }
3119 
3120  if(!canonicalize_authority(data, uri, flags, computeOnly))
3121  return FALSE;
3122 
3123  if(data->is_relative && (data->password || data->username)) {
3124  if(!canonicalize_path_opaque(data, uri, flags, computeOnly))
3125  return FALSE;
3126  } else {
3127  if(!computeOnly)
3128  uri->path_start = uri->canon_len;
3129  uri->path_len = canonicalize_path_hierarchical(data->path, data->path_len, data->scheme_type, data->host_len != 0,
3130  flags, data->has_implicit_scheme, computeOnly ? NULL : uri->canon_uri+uri->canon_len);
3131  uri->canon_len += uri->path_len;
3132  if(!computeOnly && !uri->path_len)
3133  uri->path_start = -1;
3134  }
3135  } else {
3136  /* Opaque URI's don't have an authority. */
3137  uri->userinfo_start = uri->userinfo_split = -1;
3138  uri->userinfo_len = 0;
3139  uri->host_start = -1;
3140  uri->host_len = 0;
3141  uri->host_type = Uri_HOST_UNKNOWN;
3142  uri->has_port = FALSE;
3143  uri->authority_start = -1;
3144  uri->authority_len = 0;
3145  uri->domain_offset = -1;
3146  uri->port_offset = -1;
3147 
3148  if(is_hierarchical_scheme(data->scheme_type)) {
3149  DWORD i;
3150 
3151  /* Absolute URIs aren't displayed for known scheme types
3152  * which should be hierarchical URIs.
3153  */
3154  uri->display_modifiers |= URI_DISPLAY_NO_ABSOLUTE_URI;
3155 
3156  /* Windows also sets the port for these (if they have one). */
3157  for(i = 0; i < ARRAY_SIZE(default_ports); ++i) {
3158  if(data->scheme_type == default_ports[i].scheme) {
3159  uri->has_port = TRUE;
3160  uri->port = default_ports[i].port;
3161  break;
3162  }
3163  }
3164  }
3165 
3166  if(!canonicalize_path_opaque(data, uri, flags, computeOnly))
3167  return FALSE;
3168  }
3169 
3170  if(uri->path_start > -1 && !computeOnly)
3171  /* Finding file extensions happens for both types of URIs. */
3172  uri->extension_offset = find_file_extension(uri->canon_uri+uri->path_start, uri->path_len);
3173  else
3174  uri->extension_offset = -1;
3175 
3176  return TRUE;
3177 }
3178 
3179 /* Attempts to canonicalize the query string of the URI.
3180  *
3181  * Things that happen:
3182  * 1) For known scheme types forbidden characters
3183  * are percent encoded, unless the NO_DECODE_EXTRA_INFO flag is set
3184  * or NO_ENCODE_FORBIDDEN_CHARACTERS is set.
3185  *
3186  * 2) For known scheme types, percent encoded, unreserved characters
3187  * are decoded as long as the NO_DECODE_EXTRA_INFO flag isn't set.
3188  */
3189 static BOOL canonicalize_query(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
3190  const WCHAR *ptr, *end;
3191  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
3192 
3193  if(!data->query) {
3194  uri->query_start = -1;
3195  uri->query_len = 0;
3196  return TRUE;
3197  }
3198 
3199  uri->query_start = uri->canon_len;
3200 
3201  end = data->query+data->query_len;
3202  for(ptr = data->query; ptr < end; ++ptr) {
3203  if(*ptr == '%') {
3204  if(known_scheme && !(flags & Uri_CREATE_NO_DECODE_EXTRA_INFO)) {
3206  if(is_unreserved(val)) {
3207  if(!computeOnly)
3208  uri->canon_uri[uri->canon_len] = val;
3209  ++uri->canon_len;
3210 
3211  ptr += 2;
3212  continue;
3213  }
3214  }
3215  } else if(known_scheme && is_ascii(*ptr) && !is_unreserved(*ptr) && !is_reserved(*ptr)) {
3216  if(!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS) &&
3217  !(flags & Uri_CREATE_NO_DECODE_EXTRA_INFO)) {
3218  if(!computeOnly)
3219  pct_encode_val(*ptr, uri->canon_uri+uri->canon_len);
3220  uri->canon_len += 3;
3221  continue;
3222  }
3223  }
3224 
3225  if(!computeOnly)
3226  uri->canon_uri[uri->canon_len] = *ptr;
3227  ++uri->canon_len;
3228  }
3229 
3230  uri->query_len = uri->canon_len - uri->query_start;
3231 
3232  if(!computeOnly)
3233  TRACE("(%p %p %x %d): Canonicalized query string %s len=%d\n", data, uri, flags,
3234  computeOnly, debugstr_wn(uri->canon_uri+uri->query_start, uri->query_len),
3235  uri->query_len);
3236  return TRUE;
3237 }
3238 
3239 static BOOL canonicalize_fragment(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
3240  const WCHAR *ptr, *end;
3241  const BOOL known_scheme = data->scheme_type != URL_SCHEME_UNKNOWN;
3242 
3243  if(!data->fragment) {
3244  uri->fragment_start = -1;
3245  uri->fragment_len = 0;
3246  return TRUE;
3247  }
3248 
3249  uri->fragment_start = uri->canon_len;
3250 
3251  end = data->fragment + data->fragment_len;
3252  for(ptr = data->fragment; ptr < end; ++ptr) {
3253  if(*ptr == '%') {
3254  if(known_scheme && !(flags & Uri_CREATE_NO_DECODE_EXTRA_INFO)) {
3256  if(is_unreserved(val)) {
3257  if(!computeOnly)
3258  uri->canon_uri[uri->canon_len] = val;
3259  ++uri->canon_len;
3260 
3261  ptr += 2;
3262  continue;
3263  }
3264  }
3265  } else if(known_scheme && is_ascii(*ptr) && !is_unreserved(*ptr) && !is_reserved(*ptr)) {
3266  if(!(flags & Uri_CREATE_NO_ENCODE_FORBIDDEN_CHARACTERS) &&
3267  !(flags & Uri_CREATE_NO_DECODE_EXTRA_INFO)) {
3268  if(!computeOnly)
3269  pct_encode_val(*ptr, uri->canon_uri+uri->canon_len);
3270  uri->canon_len += 3;
3271  continue;
3272  }
3273  }
3274 
3275  if(!computeOnly)
3276  uri->canon_uri[uri->canon_len] = *ptr;
3277  ++uri->canon_len;
3278  }
3279 
3280  uri->fragment_len = uri->canon_len - uri->fragment_start;
3281 
3282  if(!computeOnly)
3283  TRACE("(%p %p %x %d): Canonicalized fragment %s len=%d\n", data, uri, flags,
3284  computeOnly, debugstr_wn(uri->canon_uri+uri->fragment_start, uri->fragment_len),
3285  uri->fragment_len);
3286  return TRUE;
3287 }
3288 
3289 /* Canonicalizes the scheme information specified in the parse_data using the specified flags. */
3290 static BOOL canonicalize_scheme(const parse_data *data, Uri *uri, DWORD flags, BOOL computeOnly) {
3291  uri->scheme_start = -1;
3292  uri->scheme_len = 0;
3293 
3294  if(!data->scheme) {
3295  /* The only type of URI that doesn't have to have a scheme is a relative
3296  * URI.
3297  */
3298  if(!data->is_relative) {
3299  FIXME("(%p %p %x): Unable to determine the scheme type of %s.\n", data,
3300  uri, flags, debugstr_w(data->uri));
3301  return FALSE;
3302  }
3303  } else {
3304  if(!computeOnly) {
3305  DWORD i;
3306  INT pos = uri->canon_len;
3307 
3308  for(i = 0; i < data->scheme_len; ++i) {
3309  /* Scheme name must be lower case after canonicalization. */
3310  uri->canon_uri[i + pos] = tolowerW(data->scheme[i]);
3311  }
3312 
3313  uri->canon_uri[i + pos] = ':';
3314  uri->scheme_start = pos;
3315 
3316  TRACE("(%p %p %x): Canonicalized scheme=%s, len=%d.\n", data, uri, flags,
3317  debugstr_wn(uri->canon_uri+uri->scheme_start, data->scheme_len), data->scheme_len);
3318  }
3319 
3320  /* This happens in both computation modes. */
3321  uri->canon_len += data->scheme_len + 1;
3322  uri->scheme_len = data->scheme_len;
3323  }
3324  return TRUE;
3325 }
3326 
3327 /* Computes what the length of the URI specified by the parse_data will be
3328  * after canonicalization occurs using the specified flags.
3329  *
3330  * This function will return a non-zero value indicating the length of the canonicalized
3331  * URI, or -1 on error.
3332  */
3334  Uri uri;
3335 
3336  memset(&uri, 0, sizeof(Uri));
3337 
3338  TRACE("(%p %x): Beginning to compute canonicalized length for URI %s\n", data, flags,
3339  debugstr_w(data->uri));
3340 
3341  if(!canonicalize_scheme(data, &uri, flags, TRUE)) {
3342  ERR("(%p %x): Failed to compute URI scheme length.\n", data, flags);
3343  return -1;
3344  }
3345 
3347  ERR("(%p %x): Failed to compute URI hierpart length.\n", data, flags);
3348  return -1;
3349  }
3350 
3351  if(!canonicalize_query(data, &uri, flags, TRUE)) {
3352  ERR("(%p %x): Failed to compute query string length.\n", data, flags);
3353  return -1;
3354  }
3355 
3357  ERR("(%p %x): Failed to compute fragment length.\n", data, flags);
3358  return -1;
3359  }
3360 
3361  TRACE("(%p %x): Finished computing canonicalized URI length. length=%d\n", data, flags, uri.canon_len);
3362 
3363  return uri.canon_len;
3364 }
3365 
3366 /* Canonicalizes the URI data specified in the parse_data, using the given flags. If the
3367  * canonicalization succeeds it will store all the canonicalization information
3368  * in the pointer to the Uri.
3369  *
3370  * To canonicalize a URI this function first computes what the length of the URI
3371  * specified by the parse_data will be. Once this is done it will then perform the actual
3372  * canonicalization of the URI.
3373  */
3375  INT len;
3376 
3377  uri->canon_uri = NULL;
3378  uri->canon_size = uri->canon_len = 0;
3379 
3380  TRACE("(%p %p %x): beginning to canonicalize URI %s.\n", data, uri, flags, debugstr_w(data->uri));
3381 
3382  /* First try to compute the length of the URI. */
3384  if(len == -1) {
3385  ERR("(%p %p %x): Could not compute the canonicalized length of %s.\n", data, uri, flags,
3386  debugstr_w(data->uri));
3387  return E_INVALIDARG;
3388  }
3389 
3390  uri->canon_uri = heap_alloc((len+1)*sizeof(WCHAR));
3391  if(!uri->canon_uri)
3392  return E_OUTOFMEMORY;
3393 
3394  uri->canon_size = len;
3396  ERR("(%p %p %x): Unable to canonicalize the scheme of the URI.\n", data, uri, flags);
3397  return E_INVALIDARG;
3398  }
3399  uri->scheme_type = data->scheme_type;
3400 
3402  ERR("(%p %p %x): Unable to canonicalize the heirpart of the URI\n", data, uri, flags);
3403  return E_INVALIDARG;
3404  }
3405 
3407  ERR("(%p %p %x): Unable to canonicalize query string of the URI.\n",
3408  data, uri, flags);
3409  return E_INVALIDARG;
3410  }
3411 
3413  ERR("(%p %p %x): Unable to canonicalize fragment of the URI.\n",
3414  data, uri, flags);
3415  return E_INVALIDARG;
3416  }
3417 
3418  /* There's a possibility we didn't use all the space we allocated
3419  * earlier.
3420  */
3421  if(uri->canon_len < uri->canon_size) {
3422  /* This happens if the URI is hierarchical and dot
3423  * segments were removed from its path.
3424  */
3425  WCHAR *tmp = heap_realloc(uri->canon_uri, (uri->canon_len+1)*sizeof(WCHAR));
3426  if(!tmp)
3427  return E_OUTOFMEMORY;
3428 
3429  uri->canon_uri = tmp;
3430  uri->canon_size = uri->canon_len;
3431  }
3432 
3433  uri->canon_uri[uri->canon_len] = '\0';
3434  TRACE("(%p %p %x): finished canonicalizing the URI. uri=%s\n", data, uri, flags, debugstr_w(uri->canon_uri));
3435 
3436  return S_OK;
3437 }
3438 
3439 static HRESULT get_builder_component(LPWSTR *component, DWORD *component_len,
3440  LPCWSTR source, DWORD source_len,
3441  LPCWSTR *output, DWORD *output_len)
3442 {
3443  if(!output_len) {
3444  if(output)
3445  *output = NULL;
3446  return E_POINTER;
3447  }
3448 
3449  if(!output) {
3450  *output_len = 0;
3451  return E_POINTER;
3452  }
3453 
3454  if(!(*component) && source) {
3455  /* Allocate 'component', and copy the contents from 'source'
3456  * into the new allocation.
3457  */
3458  *component = heap_alloc((source_len+1)*sizeof(WCHAR));
3459  if(!(*component))
3460  return E_OUTOFMEMORY;
3461 
3462  memcpy(*component, source, source_len*sizeof(WCHAR));
3463  (*component)[source_len] = '\0';
3464  *component_len = source_len;
3465  }
3466 
3467  *output = *component;
3468  *output_len = *component_len;
3469  return *output ? S_OK : S_FALSE;
3470 }
3471 
3472 /* Allocates 'component' and copies the string from 'new_value' into 'component'.
3473  * If 'prefix' is set and 'new_value' isn't NULL, then it checks if 'new_value'
3474  * starts with 'prefix'. If it doesn't then 'prefix' is prepended to 'component'.
3475  *
3476  * If everything is successful, then will set 'success_flag' in 'flags'.
3477  */
3478 static HRESULT set_builder_component(LPWSTR *component, DWORD *component_len, LPCWSTR new_value,
3479  WCHAR prefix, DWORD *flags, DWORD success_flag)
3480 {
3481  heap_free(*component);
3482 
3483  if(!new_value) {
3484  *component = NULL;
3485  *component_len = 0;
3486  } else {
3487  BOOL add_prefix = FALSE;
3488  DWORD len = lstrlenW(new_value);
3489  DWORD pos = 0;
3490 
3491  if(prefix && *new_value != prefix) {
3492  add_prefix = TRUE;
3493  *component = heap_alloc((len+2)*sizeof(WCHAR));
3494  } else
3495  *component = heap_alloc((len+1)*sizeof(WCHAR));
3496 
3497  if(!(*component))
3498  return E_OUTOFMEMORY;
3499 
3500  if(add_prefix)
3501  (*component)[pos++] = prefix;
3502 
3503  memcpy(*component+pos, new_value, (len+1)*sizeof(WCHAR));
3504  *component_len = len+pos;
3505  }
3506 
3507  *flags |= success_flag;
3508  return S_OK;
3509 }
3510 
3511 static void reset_builder(UriBuilder *builder) {
3512  if(builder->uri)
3513  IUri_Release(&builder->uri->IUri_iface);
3514  builder->uri = NULL;
3515 
3516  heap_free(builder->fragment);
3517  builder->fragment = NULL;
3518  builder->fragment_len = 0;
3519 
3520  heap_free(builder->host);
3521  builder->host = NULL;
3522  builder->host_len = 0;
3523 
3524  heap_free(builder->password);
3525  builder->password = NULL;
3526  builder->password_len = 0;
3527 
3528  heap_free(builder->path);
3529  builder->path = NULL;
3530  builder->path_len = 0;
3531 
3532  heap_free(builder->query);
3533  builder->query = NULL;
3534  builder->query_len = 0;
3535 
3536  heap_free(builder->scheme);
3537  builder->scheme = NULL;
3538  builder->scheme_len = 0;
3539 
3540  heap_free(builder->username);
3541  builder->username = NULL;
3542  builder->username_len = 0;
3543 
3544  builder->has_port = FALSE;
3545  builder->port = 0;
3546  builder->modified_props = 0;
3547 }
3548 
3550  const WCHAR *component;
3551  const WCHAR *ptr;
3552  const WCHAR **pptr;
3553  DWORD expected_len;
3554 
3555  if(builder->scheme) {
3556  ptr = builder->scheme;
3557  expected_len = builder->scheme_len;
3558  } else if(builder->uri && builder->uri->scheme_start > -1) {
3559  ptr = builder->uri->canon_uri+builder->uri->scheme_start;
3560  expected_len = builder->uri->scheme_len;
3561  } else {
3562  static const WCHAR nullW[] = {0};
3563  ptr = nullW;
3564  expected_len = 0;
3565  }
3566 
3567  component = ptr;
3568  pptr = &ptr;
3570  data->scheme_len == expected_len) {
3571  if(data->scheme)
3572  TRACE("(%p %p %x): Found valid scheme component %s len=%d.\n", builder, data, flags,
3573  debugstr_wn(data->scheme, data->scheme_len), data->scheme_len);
3574  } else {
3575  TRACE("(%p %p %x): Invalid scheme component found %s.\n", builder, data, flags,
3576  debugstr_wn(component, expected_len));
3577  return INET_E_INVALID_URL;
3578  }
3579 
3580  return S_OK;
3581 }
3582 
3584  const WCHAR *ptr;
3585  const WCHAR **pptr;
3586  DWORD expected_len;
3587 
3588  if(builder->username) {
3589  ptr = builder->username;
3590  expected_len = builder->username_len;
3591  } else if(!(builder->modified_props & Uri_HAS_USER_NAME) && builder->uri &&
3592  builder->uri->userinfo_start > -1 && builder->uri->userinfo_split != 0) {
3593  /* Just use the username from the base Uri. */
3594  data->username = builder->uri->canon_uri+builder->uri->userinfo_start;
3595  data->username_len = (builder->uri->userinfo_split > -1) ?
3596  builder->uri->userinfo_split : builder->uri->userinfo_len;
3597  ptr = NULL;
3598  } else {
3599  ptr = NULL;
3600  expected_len = 0;
3601  }
3602 
3603  if(ptr) {
3604  const WCHAR *component = ptr;
3605  pptr = &ptr;
3607  data->username_len == expected_len)
3608  TRACE("(%p %p %x): Found valid username component %s len=%d.\n", builder, data, flags,
3609  debugstr_wn(data->username, data->username_len), data->username_len);
3610  else {
3611  TRACE("(%p %p %x): Invalid username component found %s.\n", builder, data, flags,
3612  debugstr_wn(component, expected_len));
3613  return INET_E_INVALID_URL;
3614  }
3615  }
3616 
3617  return S_OK;
3618 }
3619 
3621  const WCHAR *ptr;
3622  const WCHAR **pptr;
3623  DWORD expected_len;
3624 
3625  if(builder->password) {
3626  ptr = builder->password;
3627  expected_len = builder->password_len;
3628  } else if(!(builder->modified_props & Uri_HAS_PASSWORD) && builder->uri &&
3629  builder->uri->userinfo_split > -1) {
3630  data->password = builder->uri->canon_uri+builder->uri->userinfo_start+builder->uri->userinfo_split+1;
3631  data->password_len = builder->uri->userinfo_len-builder->uri->userinfo_split-1;
3632  ptr = NULL;
3633  } else {
3634  ptr = NULL;
3635  expected_len = 0;
3636  }
3637 
3638  if(ptr) {
3639  const WCHAR *component = ptr;
3640  pptr = &ptr;
3642  data->password_len == expected_len)
3643  TRACE("(%p %p %x): Found valid password component %s len=%d.\n", builder, data, flags,
3644  debugstr_wn(data->password, data->password_len), data->password_len);
3645  else {
3646  TRACE("(%p %p %x): Invalid password component found %s.\n", builder, data, flags,
3647  debugstr_wn(component, expected_len));
3648  return INET_E_INVALID_URL;
3649  }
3650  }
3651 
3652  return S_OK;
3653 }
3654 
3656  HRESULT hr;
3657 
3658  hr = validate_username(builder, data, flags);
3659  if(FAILED(hr))
3660  return hr;
3661 
3662  hr = validate_password(builder, data, flags);
3663  if(FAILED(hr))
3664  return hr;
3665 
3666  return S_OK;
3667 }
3668 
3670  const WCHAR *ptr;
3671  const WCHAR **pptr;
3672  DWORD expected_len;
3673 
3674  if(builder->host) {
3675  ptr = builder->host;
3676  expected_len = builder->host_len;
3677  } else if(!(builder->modified_props & Uri_HAS_HOST) && builder->uri && builder->uri->host_start > -1) {
3678  ptr = builder->uri->canon_uri + builder->uri->host_start;
3679  expected_len = builder->uri->host_len;
3680  } else
3681  ptr = NULL;
3682 
3683  if(ptr) {
3684  const WCHAR *component = ptr;
3686  pptr = &ptr;
3687 
3688  if(parse_host(pptr, data, flags, extras) && data->host_len == expected_len)
3689  TRACE("(%p %p %x): Found valid host name %s len=%d type=%d.\n", builder, data, flags,
3690  debugstr_wn(data->host, data->host_len), data->host_len, data->host_type);
3691  else {
3692  TRACE("(%p %p %x): Invalid host name found %s.\n", builder, data, flags,
3693  debugstr_wn(component, expected_len));
3694  return INET_E_INVALID_URL;
3695  }
3696  }
3697 
3698  return S_OK;
3699 }
3700 
3701 static void setup_port(const UriBuilder *builder, parse_data *data, DWORD flags) {
3702  if(builder->modified_props & Uri_HAS_PORT) {
3703  if(builder->has_port) {
3704  data->has_port = TRUE;
3705  data->port_value = builder->port;
3706  }
3707  } else if(builder->uri && builder->uri->has_port) {
3708  data->has_port = TRUE;
3709  data->port_value = builder->uri->port;
3710  }
3711 
3712  if(data->has_port)
3713  TRACE("(%p %p %x): Using %u as port for IUri.\n", builder, data, flags, data->port_value);
3714 }
3715 
3717  const WCHAR *ptr = NULL;
3718  const WCHAR *component;
3719  const WCHAR **pptr;
3720  DWORD expected_len;
3721  BOOL check_len = TRUE;
3722  BOOL valid = FALSE;
3723 
3724  if(builder->path) {
3725  ptr = builder->path;
3726  expected_len = builder->path_len;
3727  } else if(!(builder->modified_props & Uri_HAS_PATH) &&
3728  builder->uri && builder->uri->path_start > -1) {
3729  ptr = builder->uri->canon_uri+builder->uri->path_start;
3730  expected_len = builder->uri->path_len;
3731  } else {
3732  static const WCHAR nullW[] = {0};
3733  ptr = nullW;
3734  check_len = FALSE;
3735  expected_len = -1;
3736  }
3737 
3738  component = ptr;
3739  pptr = &ptr;
3740 
3741  /* How the path is validated depends on what type of
3742  * URI it is.
3743  */
3744  valid = data->is_opaque ?
3746 
3747  if(!valid || (check_len && expected_len != data->path_len)) {
3748  TRACE("(%p %p %x): Invalid path component %s.\n", builder, data, flags,
3749  debugstr_wn(component, expected_len) );
3750  return INET_E_INVALID_URL;
3751  }
3752 
3753  TRACE("(%p %p %x): Valid path component %s len=%d.\n", builder, data, flags,
3754  debugstr_wn(data->path, data->path_len), data->path_len);
3755 
3756  return S_OK;
3757 }
3758 
3760  const WCHAR *ptr = NULL;
3761  const WCHAR **pptr;
3762  DWORD expected_len;
3763 
3764  if(builder->query) {
3765  ptr = builder->query;
3766  expected_len = builder->query_len;
3767  } else if(!(builder->modified_props & Uri_HAS_QUERY) && builder->uri &&
3768  builder->uri->query_start > -1) {
3769  ptr = builder->uri->canon_uri+builder->uri->query_start;
3770  expected_len = builder->uri->query_len;
3771  }
3772 
3773  if(ptr) {
3774  const WCHAR *component = ptr;
3775  pptr = &ptr;
3776 
3777  if(parse_query(pptr, data, flags) && expected_len == data->query_len)
3778  TRACE("(%p %p %x): Valid query component %s len=%d.\n", builder, data, flags,
3779  debugstr_wn(data->query, data->query_len), data->query_len);
3780  else {
3781  TRACE("(%p %p %x): Invalid query component %s.\n", builder, data, flags,
3782  debugstr_wn(component, expected_len));
3783  return INET_E_INVALID_URL;
3784  }
3785  }
3786 
3787  return S_OK;
3788 }
3789 
3791  const WCHAR *ptr = NULL;
3792  const WCHAR **pptr;
3793  DWORD expected_len;
3794 
3795  if(builder->fragment) {
3796  ptr = builder->fragment;
3797  expected_len = builder->fragment_len;
3798  } else if(!(builder->modified_props & Uri_HAS_FRAGMENT) && builder->uri &&
3799  builder->uri->fragment_start > -1) {
3800  ptr = builder->uri->canon_uri+builder->uri->fragment_start;
3801  expected_len = builder->uri->fragment_len;
3802  }
3803 
3804  if(ptr) {
3805  const WCHAR *component = ptr;
3806  pptr = &ptr;
3807 
3808  if(parse_fragment(pptr, data, flags) && expected_len == data->fragment_len)
3809  TRACE("(%p %p %x): Valid fragment component %s len=%d.\n", builder, data, flags,
3810  debugstr_wn(data->fragment, data->fragment_len), data->fragment_len);
3811  else {
3812  TRACE("(%p %p %x): Invalid fragment component %s.\n", builder, data, flags,
3813  debugstr_wn(component, expected_len));
3814  return INET_E_INVALID_URL;
3815  }
3816  }
3817 
3818  return S_OK;
3819 }
3820 
3822  HRESULT hr;
3823 
3824  memset(data, 0, sizeof(parse_data));
3825 
3826  TRACE("(%p %p %x): Beginning to validate builder components.\n", builder, data, flags);
3827 
3828  hr = validate_scheme_name(builder, data, flags);
3829  if(FAILED(hr))
3830  return hr;
3831 
3832  /* Extra validation for file schemes. */
3833  if(data->scheme_type == URL_SCHEME_FILE) {
3834  if((builder->password || (builder->uri && builder->uri->userinfo_split > -1)) ||
3835  (builder->username || (builder->uri && builder->uri->userinfo_start > -1))) {
3836  TRACE("(%p %p %x): File schemes can't contain a username or password.\n",
3837  builder, data, flags);
3838  return INET_E_INVALID_URL;
3839  }
3840  }
3841 
3842  hr = validate_userinfo(builder, data, flags);
3843  if(FAILED(hr))
3844  return hr;
3845 
3846  hr = validate_host(builder, data, flags);
3847  if(FAILED(hr))
3848  return hr;
3849 
3850  setup_port(builder, data, flags);
3851 
3852  /* The URI is opaque if it doesn't have an authority component. */
3853  if(!data->is_relative)
3854  data->is_opaque = !data->username && !data->password && !data->host && !data->has_port
3855  && data->scheme_type != URL_SCHEME_FILE;
3856  else
3857  data->is_opaque = !data->host && !data->has_port;
3858 
3859  hr = validate_path(builder, data, flags);
3860  if(FAILED(hr))
3861  return hr;
3862 
3863  hr = validate_query(builder, data, flags);
3864  if(FAILED(hr))
3865  return hr;
3866 
3867  hr = validate_fragment(builder, data, flags);
3868  if(FAILED(hr))
3869  return hr;
3870 
3871  TRACE("(%p %p %x): Finished validating builder components.\n", builder, data, flags);
3872 
3873  return S_OK;
3874 }
3875 
3876 static HRESULT compare_file_paths(const Uri *a, const Uri *b, BOOL *ret)
3877 {
3878  WCHAR *canon_path_a, *canon_path_b;
3879  DWORD len_a, len_b;
3880 
3881  if(!a->path_len) {
3882  *ret = !b->path_len;
3883  return S_OK;
3884  }
3885 
3886  if(!b->path_len) {
3887  *ret = FALSE;
3888  return S_OK;
3889  }
3890 
3891  /* Fast path */
3892  if(a->path_len == b->path_len && !memicmpW(a->canon_uri+a->path_start, b->canon_uri+b->path_start, a->path_len)) {
3893  *ret = TRUE;
3894  return S_OK;
3895  }
3896 
3897  len_a = canonicalize_path_hierarchical(a->canon_uri+a->path_start, a->path_len, a->scheme_type, FALSE, 0, FALSE, NULL);
3898  len_b = canonicalize_path_hierarchical(b->canon_uri+b->path_start, b->path_len, b->scheme_type, FALSE, 0, FALSE, NULL);
3899 
3900  canon_path_a = heap_alloc(len_a*sizeof(WCHAR));
3901  if(!canon_path_a)
3902  return E_OUTOFMEMORY;
3903  canon_path_b = heap_alloc(len_b*sizeof(WCHAR));
3904  if(!canon_path_b) {
3905  heap_free(canon_path_a);
3906  return E_OUTOFMEMORY;
3907  }
3908 
3909  len_a = canonicalize_path_hierarchical(a->canon_uri+a->path_start, a->path_len, a->scheme_type, FALSE, 0, FALSE, canon_path_a);
3910  len_b = canonicalize_path_hierarchical(b->canon_uri+b->path_start, b->path_len, b->scheme_type, FALSE, 0, FALSE, canon_path_b);
3911 
3912  *ret = len_a == len_b && !memicmpW(canon_path_a, canon_path_b, len_a);
3913 
3914  heap_free(canon_path_a);
3915  heap_free(canon_path_b);
3916  return S_OK;
3917 }
3918 
3919 /* Checks if the two Uri's are logically equivalent. It's a simple
3920  * comparison, since they are both of type Uri, and it can access
3921  * the properties of each Uri directly without the need to go
3922  * through the "IUri_Get*" interface calls.
3923  */
3924 static HRESULT compare_uris(const Uri *a, const Uri *b, BOOL *ret) {
3925  const BOOL known_scheme = a->scheme_type != URL_SCHEME_UNKNOWN;
3926  const BOOL are_hierarchical = a->authority_start > -1 && b->authority_start > -1;
3927  HRESULT hres;
3928 
3929  *ret = FALSE;
3930 
3931  if(a->scheme_type != b->scheme_type)
3932  return S_OK;
3933 
3934  /* Only compare the scheme names (if any) if their unknown scheme types. */
3935  if(!known_scheme) {
3936  if((a->scheme_start > -1 && b->scheme_start > -1) &&
3937  (a->scheme_len == b->scheme_len)) {
3938  /* Make sure the schemes are the same. */
3939  if(StrCmpNW(a->canon_uri+a->scheme_start, b->canon_uri+b->scheme_start, a->scheme_len))
3940  return S_OK;
3941  } else if(a->scheme_len != b->scheme_len)
3942  /* One of the Uri's has a scheme name, while the other doesn't. */
3943  return S_OK;
3944  }
3945 
3946  /* If they have a userinfo component, perform case sensitive compare. */
3947  if((a->userinfo_start > -1 && b->userinfo_start > -1) &&
3948  (a->userinfo_len == b->userinfo_len)) {
3949  if(StrCmpNW(a->canon_uri+a->userinfo_start, b->canon_uri+b->userinfo_start, a->userinfo_len))
3950  return S_OK;
3951  } else if(a->userinfo_len != b->userinfo_len)
3952  /* One of the Uri's had a userinfo, while the other one doesn't. */
3953  return S_OK;
3954 
3955  /* Check if they have a host name. */
3956  if((a->host_start > -1 && b->host_start > -1) &&
3957  (a->host_len == b->host_len)) {
3958  /* Perform a case insensitive compare if they are a known scheme type. */
3959  if(known_scheme) {
3960  if(StrCmpNIW(a->canon_uri+a->host_start, b->canon_uri+b->host_start, a->host_len))
3961  return S_OK;
3962  } else if(StrCmpNW(a->canon_uri+a->host_start, b->canon_uri+b->host_start, a->host_len))
3963  return S_OK;
3964  } else if(a->host_len != b->host_len)
3965  /* One of the Uri's had a host, while the other one didn't. */
3966  return S_OK;
3967 
3968  if(a->has_port && b->has_port) {
3969  if(a->port != b->port)
3970  return S_OK;
3971  } else if(a->has_port || b->has_port)
3972  /* One had a port, while the other one didn't. */
3973  return S_OK;
3974 
3975  /* Windows is weird with how it handles paths. For example
3976  * One URI could be "http://google.com" (after canonicalization)
3977  * and one could be "http://google.com/" and the IsEqual function
3978  * would still evaluate to TRUE, but, only if they are both hierarchical
3979  * URIs.
3980  */
3981  if(a->scheme_type == URL_SCHEME_FILE) {
3982  BOOL cmp;
3983 
3984  hres = compare_file_paths(a, b, &cmp);
3985  if(FAILED(hres) || !cmp)
3986  return hres;
3987  } else if((a->path_start > -1 && b->path_start > -1) &&
3988  (a->path_len == b->path_len)) {
3989  if(StrCmpNW(a->canon_uri+a->path_start, b->canon_uri+b->path_start, a->path_len))
3990  return S_OK;
3991  } else if(are_hierarchical && a->path_len == -1 && b->path_len == 0) {
3992  if(*(a->canon_uri+a->path_start) != '/')
3993  return S_OK;
3994  } else if(are_hierarchical && b->path_len == 1 && a->path_len == 0) {
3995  if(*(b->canon_uri+b->path_start) != '/')
3996  return S_OK;
3997  } else if(a->path_len != b->path_len)
3998  return S_OK;
3999 
4000  /* Compare the query strings of the two URIs. */
4001  if((a->query_start > -1 && b->query_start > -1) &&
4002  (a->query_len == b->query_len)) {
4003  if(StrCmpNW(a->canon_uri+a->query_start, b->canon_uri+b->query_start, a->query_len))
4004  return S_OK;
4005  } else if(a->query_len != b->query_len)
4006  return S_OK;
4007 
4008  if((a->fragment_start > -1 && b->fragment_start > -1) &&
4009  (a->fragment_len == b->fragment_len)) {
4010  if(StrCmpNW(a->canon_uri+a->fragment_start, b->canon_uri+b->fragment_start, a->fragment_len))
4011  return S_OK;
4012  } else if(a->fragment_len != b->fragment_len)
4013  return S_OK;
4014 
4015  /* If we get here, the two URIs are equivalent. */
4016  *ret = TRUE;
4017  return S_OK;
4018 }
4019 
4021  WCHAR *output, DWORD *output_len)
4022 {
4023  const WCHAR *ptr = path;
4024 
4025  if(path_len > 3 && *ptr == '/' && is_drive_path(path+1))
4026  /* Skip over the leading / before the drive path. */
4027  ++ptr;
4028 
4029  for(; ptr < path+path_len; ++ptr) {
4030  if(*ptr == '/') {
4031  if(output)
4032  *output++ = '\\';
4033  (*output_len)++;
4034  } else {
4035  if(output)
4036  *output++ = *ptr;
4037  (*output_len)++;
4038  }
4039  }
4040 }
4041 
4042 /* Generates a raw uri string using the parse_data. */
4044  DWORD length = 0;
4045 
4046  if(data->scheme) {
4047  if(uri) {
4048  memcpy(uri, data->scheme, data->scheme_len*sizeof(WCHAR));
4049