001package co.codewizards.cloudstore.core.util;
002
003import static java.util.Objects.*;
004
005import java.io.CharArrayWriter;
006import java.io.UnsupportedEncodingException;
007import java.nio.charset.Charset;
008import java.nio.charset.IllegalCharsetNameException;
009import java.nio.charset.StandardCharsets;
010import java.nio.charset.UnsupportedCharsetException;
011import java.util.BitSet;
012
/**
 * URL-encoder encoding all special characters (that cannot be left unchanged) as "%...".
 * <p>
 * In contrast to the {@link java.net.URLEncoder URLEncoder}, this class does <b>not</b> encode
 * ' ' (space) as '+' (plus); a space is percent-encoded as "%20" like any other special character.
 * <p>
 * Additionally, this class does not use the default encoding, but always UTF-8, if not specified
 * otherwise.
 * <p>
 * The reason for this class is that {@link java.io.File#toURI() File.toURI()}
 * does not encode a "+" sign. Therefore, our URL-encoding and decoding must
 * not handle the "+" specifically.
 * <p>
 * Another reason is <a href="https://java.net/jira/browse/JERSEY-417">JERSEY-417</a>.
 * I originally used {@code org.glassfish.jersey.uri.UriComponent.encode(String, Type)}
 * at some code locations, but since not all code locations have a dependency on Jersey,
 * I decided to switch consistently everywhere to {@code UrlEncoder} and {@link UrlDecoder}.
 * <p>
 * This class was copied from {@link java.net.URLEncoder URLEncoder} and changed to fit our needs.
 * @see UrlDecoder
 * @author Marco หงุ่ยตระกูล-Schulze - marco at codewizards dot co
 */
public final class UrlEncoder {

    private UrlEncoder() {
    }

    /**
     * Characters that are copied to the output unchanged, indexed by their char code.
     * Every character whose bit is not set is percent-encoded.
     */
    static final BitSet dontNeedEncoding = createDontNeedEncoding();

    /**
     * Distance between a lower-case ASCII letter and its upper-case counterpart.
     * No longer used by this class itself; kept because it is package-visible.
     */
    static final int caseDiff = ('a' - 'A');

    /** Upper-case hexadecimal digits used for the two characters following each '%'. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Builds the set of characters that are never encoded.
     * <p>
     * The list of characters that are not encoded has been determined as follows:
     * <p>
     * RFC 2396 states:
     * <pre>
     * Data characters that are allowed in a URI but do not have a
     * reserved purpose are called unreserved.  These include upper
     * and lower case letters, decimal digits, and a limited set of
     * punctuation marks and symbols.
     *
     * unreserved  = alphanum | mark
     *
     * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
     *
     * Unreserved characters can be escaped without changing the
     * semantics of the URI, but this should not be done unless the
     * URI is being used in a context that does not allow the
     * unescaped character to appear.
     * </pre>
     * It appears that both Netscape and Internet Explorer escape
     * all special characters from this list with the exception
     * of "-", "_", ".", "*". While it is not clear why they are
     * escaping the other characters, perhaps it is safest to
     * assume that there might be contexts in which the others
     * are unsafe if not escaped. Therefore, we will use the same
     * list. It is also noteworthy that this is consistent with
     * O'Reilly's "HTML: The Definitive Guide" (page 164).
     * <p>
     * As a last note, Internet Explorer does not encode the "@"
     * character which is clearly not unreserved according to the
     * RFC. We are being consistent with the RFC in this matter,
     * as is Netscape.
     * <p>
     * Note that ' ' (space) is deliberately <b>not</b> part of this set, so it is
     * percent-encoded like every other special character.
     *
     * @return the set of characters that are passed through unchanged.
     */
    private static BitSet createDontNeedEncoding() {
        final BitSet safe = new BitSet(256);
        safe.set('a', 'z' + 1);
        safe.set('A', 'Z' + 1);
        safe.set('0', '9' + 1);
        safe.set('-');
        safe.set('_');
        safe.set('.');
        safe.set('*');
        return safe;
    }

    /**
     * Percent-encodes a string using UTF-8.
     * <p>
     * ASCII letters, digits and the characters "-", "_", "." and "*" are kept as they are.
     * Every other character (including ' ' and '+') is converted to its UTF-8 bytes, each of
     * which is written as "%XX" with upper-case hexadecimal digits.
     *
     * @param   s   {@code String} to be translated. Must not be {@code null}.
     * @return  the translated {@code String}; the very same instance {@code s}, if nothing
     *          had to be encoded.
     * @throws  NullPointerException if {@code s} is {@code null}.
     * @see UrlDecoder
     */
    public static String encode(String s) {
        requireNonNull(s, "s");
        return percentEncode(s, StandardCharsets.UTF_8);
    }

    /**
     * Percent-encodes a string using a specific encoding scheme. This method uses the
     * supplied encoding scheme to obtain the bytes for unsafe characters.
     * <p>
     * The result resembles {@code application/x-www-form-urlencoded}, except that
     * ' ' (space) is encoded as "%20" and never as '+'.
     * <p>
     * <em><strong>Note:</strong> The <a href=
     * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
     * World Wide Web Consortium Recommendation</a> states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilities.</em>
     *
     * @param   s   {@code String} to be translated. Must not be {@code null}.
     * @param   enc   The name of a supported
     *    <a href="../lang/package-summary.html#charenc">character
     *    encoding</a>. Must not be {@code null}.
     * @return  the translated {@code String}.
     * @exception  UnsupportedEncodingException
     *             If the named encoding is syntactically illegal or not supported. The
     *             underlying charset exception is attached as cause.
     * @throws  NullPointerException if {@code s} or {@code enc} is {@code null}.
     * @see UrlDecoder#decode(String, String)
     * @deprecated UTF-8 should be used; it is thus recommended to invoke {@link #encode(String)} instead.
     */
    @Deprecated
    public static String encode(String s, String enc) throws UnsupportedEncodingException {
        requireNonNull(s, "s");
        requireNonNull(enc, "enc");
        final Charset charset;
        try {
            charset = Charset.forName(enc);
        } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
            // UnsupportedEncodingException has no cause-taking constructor, hence initCause(...).
            final UnsupportedEncodingException unsupported = new UnsupportedEncodingException(enc);
            unsupported.initCause(e);
            throw unsupported;
        }
        return percentEncode(s, charset);
    }

    /**
     * Percent-encodes a string using a specific encoding scheme. This method uses the
     * supplied encoding scheme to obtain the bytes for unsafe characters.
     * <p>
     * The result resembles {@code application/x-www-form-urlencoded}, except that
     * ' ' (space) is encoded as "%20" and never as '+'.
     * <p>
     * <em><strong>Note:</strong> The <a href=
     * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
     * World Wide Web Consortium Recommendation</a> states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilities.</em>
     *
     * @param   s   {@code String} to be translated. Must not be {@code null}.
     * @param   charset   The <a href="../lang/package-summary.html#charenc">character encoding</a>.
     *    Must not be {@code null}.
     * @return  the translated {@code String}; the very same instance {@code s}, if nothing
     *          had to be encoded.
     * @throws  NullPointerException if {@code s} or {@code charset} is {@code null}.
     * @see UrlDecoder#decode(String, Charset)
     * @deprecated UTF-8 should be used; it is thus recommended to invoke {@link #encode(String)} instead.
     */
    @Deprecated
    public static String encode(String s, Charset charset) {
        requireNonNull(s, "s");
        requireNonNull(charset, "charset");
        return percentEncode(s, charset);
    }

    /**
     * Shared implementation of all {@code encode(...)} overloads; expects non-null arguments.
     *
     * @param s the string to encode.
     * @param charset the charset used to convert unsafe characters into bytes.
     * @return the encoded string, or {@code s} itself if it contains only safe characters.
     */
    private static String percentEncode(String s, Charset charset) {
        final int length = s.length();
        final StringBuilder out = new StringBuilder(length);
        boolean changed = false;

        int pos = 0;
        while (pos < length) {
            final char c = s.charAt(pos);
            if (dontNeedEncoding.get(c)) {
                out.append(c);
                pos++;
                continue;
            }

            // Collect the maximal run of consecutive unsafe characters and convert it to
            // bytes in one go. Surrogates are never part of the safe set, so a surrogate
            // pair always lies within a single run and is never split before conversion.
            int runEnd = pos + 1;
            while (runEnd < length && !dontNeedEncoding.get(s.charAt(runEnd))) {
                runEnd++;
            }

            for (final byte b : s.substring(pos, runEnd).getBytes(charset)) {
                out.append('%');
                out.append(HEX_DIGITS[(b >> 4) & 0xF]);
                out.append(HEX_DIGITS[b & 0xF]);
            }

            pos = runEnd;
            changed = true;
        }

        // Hand back the original instance when there was nothing to encode.
        return changed ? out.toString() : s;
    }
}