package co.codewizards.cloudstore.core.util;

import static java.util.Objects.*;

import java.io.CharArrayWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.BitSet;

/**
 * URL-encoder encoding all special characters (that cannot be left unchanged) as "%...".
 * <p>
 * In contrast to the {@link java.net.URLEncoder URLEncoder}, this class does <b>not</b> encode
 * ' ' (space) as '+' (plus)! A space is encoded as "%20" and a plus sign as "%2B".
 * <p>
 * Additionally, this class does not use the default encoding, but always UTF-8, if not specified
 * otherwise.
 * <p>
 * The reason for this class is that {@link java.io.File#toURI() File.toURI()}
 * does not encode a "+" sign. Therefore, our URL-encoding and decoding must
 * not handle the "+" specifically.
 * <p>
 * Another reason is <a href="https://java.net/jira/browse/JERSEY-417">JERSEY-417</a>.
 * I originally used {@code org.glassfish.jersey.uri.UriComponent.encode(String, Type)}
 * at some code locations, but since not all code locations have a dependency on Jersey,
 * I decided to switch consistently everywhere to {@code UrlEncoder} and {@link UrlDecoder}.
 * <p>
 * This class was copied from {@link java.net.URLEncoder URLEncoder} and changed to fit our needs.
 * @see UrlDecoder
 * @author Marco หงุ่ยตระกูล-Schulze - marco at codewizards dot co
 */
public final class UrlEncoder {

	private UrlEncoder() {
	}

	/** Characters (by code unit value) that are copied to the output unchanged. */
	static final BitSet dontNeedEncoding;

	/** Distance between a lower-case ASCII letter and its upper-case counterpart. */
	static final int caseDiff = ('a' - 'A');

	static {

		/* The list of characters that are not encoded has been
		 * determined as follows:
		 *
		 * RFC 2396 states:
		 * -----
		 * Data characters that are allowed in a URI but do not have a
		 * reserved purpose are called unreserved.  These include upper
		 * and lower case letters, decimal digits, and a limited set of
		 * punctuation marks and symbols.
		 *
		 * unreserved  = alphanum | mark
		 *
		 * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
		 *
		 * Unreserved characters can be escaped without changing the
		 * semantics of the URI, but this should not be done unless the
		 * URI is being used in a context that does not allow the
		 * unescaped character to appear.
		 * -----
		 *
		 * It appears that both Netscape and Internet Explorer escape
		 * all special characters from this list with the exception
		 * of "-", "_", ".", "*". While it is not clear why they are
		 * escaping the other characters, perhaps it is safest to
		 * assume that there might be contexts in which the others
		 * are unsafe if not escaped. Therefore, we will use the same
		 * list. It is also noteworthy that this is consistent with
		 * O'Reilly's "HTML: The Definitive Guide" (page 164).
		 *
		 * As a last note, Internet Explorer does not encode the "@"
		 * character which is clearly not unreserved according to the
		 * RFC. We are being consistent with the RFC in this matter,
		 * as is Netscape.
		 *
		 * Note that, unlike java.net.URLEncoder, the space character is
		 * NOT part of this set; it is percent-encoded like any other
		 * special character.
		 */

		dontNeedEncoding = new BitSet(256);
		for (int c = 'a'; c <= 'z'; c++) {
			dontNeedEncoding.set(c);
		}
		for (int c = 'A'; c <= 'Z'; c++) {
			dontNeedEncoding.set(c);
		}
		for (int c = '0'; c <= '9'; c++) {
			dontNeedEncoding.set(c);
		}
		dontNeedEncoding.set('-');
		dontNeedEncoding.set('_');
		dontNeedEncoding.set('.');
		dontNeedEncoding.set('*');
	}


	/**
	 * Percent-encodes a string using UTF-8.
	 * <p>
	 * ASCII letters, digits and the characters "-", "_", "." and "*" are kept as they are;
	 * every other character (including space and "+") is converted to its UTF-8 bytes, each
	 * written as "%XX" with upper-case hex digits.
	 *
	 * @param s {@code String} to be translated. Must not be {@code null}.
	 * @return the translated {@code String}; {@code s} itself, if nothing had to be encoded.
	 */
	public static String encode(String s) {
		return encode(s, StandardCharsets.UTF_8);
	}

	/**
	 * Percent-encodes a string using the character encoding with the given name. This method
	 * uses the supplied encoding scheme to obtain the bytes for unsafe characters.
	 * <p>
	 * <em><strong>Note:</strong> The <a href=
	 * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
	 * World Wide Web Consortium Recommendation</a> states that
	 * UTF-8 should be used. Not doing so may introduce
	 * incompatibilities.</em>
	 *
	 * @param   s   {@code String} to be translated. Must not be {@code null}.
	 * @param   enc   The name of a supported {@linkplain Charset character encoding}.
	 *    Must not be {@code null}.
	 * @return  the translated {@code String}.
	 * @throws UnsupportedEncodingException
	 *             If the named encoding is syntactically illegal or not supported. The
	 *             underlying charset exception is available as the cause.
	 * @see UrlDecoder#decode(String, String)
	 * @deprecated UTF-8 should be used; it is thus recommended to invoke {@link #encode(String)} instead.
	 */
	@Deprecated
	public static String encode(String s, String enc) throws UnsupportedEncodingException {
		requireNonNull(s, "s");
		requireNonNull(enc, "enc");
		final Charset charset;
		try {
			charset = Charset.forName(enc);
		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
			// UnsupportedEncodingException has no cause-taking constructor, hence initCause(...).
			UnsupportedEncodingException unsupported = new UnsupportedEncodingException(enc);
			unsupported.initCause(e);
			throw unsupported;
		}
		return encode(s, charset);
	}

	/**
	 * Percent-encodes a string using a specific character encoding. This method uses the
	 * supplied encoding scheme to obtain the bytes for unsafe characters.
	 * <p>
	 * ASCII letters, digits and the characters "-", "_", "." and "*" are kept as they are.
	 * Every maximal run of other characters is converted to bytes via the given charset and
	 * each byte is written as "%XX" with upper-case hex digits. Converting whole runs (instead
	 * of single {@code char}s) keeps surrogate pairs together.
	 * <p>
	 * <em><strong>Note:</strong> The <a href=
	 * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
	 * World Wide Web Consortium Recommendation</a> states that
	 * UTF-8 should be used. Not doing so may introduce
	 * incompatibilities.</em>
	 *
	 * @param   s   {@code String} to be translated. Must not be {@code null}.
	 * @param   charset   The {@linkplain Charset character encoding}. Must not be {@code null}.
	 * @return  the translated {@code String}; {@code s} itself, if nothing had to be encoded.
	 * @see UrlDecoder#decode(String, Charset)
	 * @deprecated UTF-8 should be used; it is thus recommended to invoke {@link #encode(String)} instead.
	 */
	@Deprecated
	public static String encode(String s, Charset charset) {
		requireNonNull(s, "s");
		requireNonNull(charset, "charset");

		final int length = s.length();

		// Skip the leading part that can be taken over unchanged.
		int index = 0;
		while (index < length && dontNeedEncoding.get(s.charAt(index))) {
			index++;
		}
		if (index == length) {
			return s; // nothing to encode at all
		}

		final StringBuilder out = new StringBuilder(length + 16);
		out.append(s, 0, index);

		// Invariant at the top of the loop: s.charAt(index) needs encoding.
		while (index < length) {
			// Maximal run of characters needing encoding. A low surrogate is never
			// contained in dontNeedEncoding, so a surrogate pair is never split here.
			int runStart = index;
			while (index < length && !dontNeedEncoding.get(s.charAt(index))) {
				index++;
			}
			appendPercentEncoded(out, s.substring(runStart, index).getBytes(charset));

			// Maximal run of characters copied verbatim.
			runStart = index;
			while (index < length && dontNeedEncoding.get(s.charAt(index))) {
				index++;
			}
			out.append(s, runStart, index);
		}
		return out.toString();
	}

	/** Appends every byte as "%XX" (two upper-case hex digits) to {@code out}. */
	private static void appendPercentEncoded(StringBuilder out, byte[] bytes) {
		for (byte b : bytes) {
			out.append('%');
			out.append(upperHexDigit((b >> 4) & 0xF));
			out.append(upperHexDigit(b & 0xF));
		}
	}

	/** Returns the upper-case hex digit ('0'-'9', 'A'-'F') for a value between 0 and 15. */
	private static char upperHexDigit(int nibble) {
		char digit = Character.forDigit(nibble, 16);
		return Character.isLetter(digit) ? (char) (digit - caseDiff) : digit;
	}
}