/** * This work was created by participants in the DataONE project, and is * jointly copyrighted by participating institutions in DataONE. For * more information on DataONE, see our web site at http://dataone.org. * * Copyright ${year} * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * $Id$ */ package org.dataone.service.util; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.BitSet; /* Encoding routines Adapted from: (http://www.java2s.com/Code/Java/Network-Protocol/EncodeapathasrequiredbytheURLspecification.htm) Derby - Class org.apache.derby.iapi.util.PropertyUtil Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
/*
 * Decoding routines adapted from:
 * http://www.java2s.com/Code/Java/Network-Protocol/Requestparsingandencodingutilitymethods.htm
 *
 * distributed with the following license:
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Useful resources for background on encoding
 *   http://www.java-forums.org/new-java/350-how-obtain-ascii-code-character.html
 *   http://stackoverflow.com/questions/1527856/how-can-i-iterate-through-the-unicode-codepoints-of-a-java-string
 *   http://purl.dataone.org/architecture/design/PIDs.html
 *   http://download.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html#unicode
 *   http://www.joelonsoftware.com/articles/Unicode.html
 *   http://www.utf8-chartable.de/unicode-utf8-table.pl
 */

/**
 * Static helpers for percent-encoding the path-segment, query and fragment
 * parts of a URL (RFC 3986), for decoding percent-encoded strings, and for
 * decoding the XML predefined entities.
 */
public class EncodingUtilities {

    /**
     * Characters left unescaped in a path segment. Based on the 'pchar' set
     * defined by the RFC 3986 ABNF:
     * <pre>
     *   pchar      = unreserved / pct-encoded / sub-delims / ":" / "@"
     *   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
     * </pre>
     * with '+' and ';' deliberately left out (see the static initializer).
     */
    private static final BitSet pcharUnescapedCharacters;

    /** Characters left unescaped in a fragment: the path-segment set plus '/', '?' and ';'. */
    private static final BitSet fragmentUnescapedCharacters;

    /** Characters left unescaped in a query segment: the fragment set minus '=' and '&amp;'. */
    private static final BitSet queryUnescapedCharacters;

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final char[] hexadecimal = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
    };

    static {
        pcharUnescapedCharacters = new BitSet(256);
        int i;

        // unreserved: ALPHA
        for (i = 'a'; i <= 'z'; i++) {
            pcharUnescapedCharacters.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            pcharUnescapedCharacters.set(i);
        }
        // unreserved: DIGIT
        for (i = '0'; i <= '9'; i++) {
            pcharUnescapedCharacters.set(i);
        }
        // unreserved: other
        pcharUnescapedCharacters.set('-');
        pcharUnescapedCharacters.set('_');
        pcharUnescapedCharacters.set('.');
        pcharUnescapedCharacters.set('~');

        // sub-delims
        pcharUnescapedCharacters.set('!');
        pcharUnescapedCharacters.set('$');
        pcharUnescapedCharacters.set('&');
        pcharUnescapedCharacters.set('\'');
        pcharUnescapedCharacters.set('(');
        pcharUnescapedCharacters.set(')');
        pcharUnescapedCharacters.set('*');
        // '+' is intentionally NOT in the set: some older URL parsers replace
        // '+' with a space, so it is escaped to avoid mis-interpretation by clients.
        pcharUnescapedCharacters.set(',');
        pcharUnescapedCharacters.set('=');
        // ';' is intentionally NOT in the set: Apache/mod_jk doesn't like ';'
        // due to the security risk.

        // allowable from the general delimiters set
        pcharUnescapedCharacters.set(':');
        pcharUnescapedCharacters.set('@');

        // fragment: a superset of pchar
        fragmentUnescapedCharacters = (BitSet) pcharUnescapedCharacters.clone();
        fragmentUnescapedCharacters.set('/');
        fragmentUnescapedCharacters.set('?');
        fragmentUnescapedCharacters.set(';');

        // query: in the ABNF this is the same as fragment, but '=' and '&' are
        // removed to follow the key-value pair convention (the ABNF allows them,
        // but they have a reserved purpose within the query part of the URL that
        // the ABNF does not define).
        queryUnescapedCharacters = (BitSet) fragmentUnescapedCharacters.clone();
        queryUnescapedCharacters.clear('=');
        queryUnescapedCharacters.clear('&');
    }

    /**
     * Encode a URL path segment as required by the URL specifications
     * (RFC 3986 and RFC 1738 section 3.3).
     * This differs from java.net.URLEncoder.encode() which encodes according
     * to the x-www-form-urlencoded MIME format.
     *
     * @param segmentString the http URL path segment to encode
     * @return the URL-path-segment-safe UTF-8 encoded string,
     *         or null if segmentString is null
     */
    public static String encodeUrlPathSegment(String segmentString) {
        return encodeString(pcharUnescapedCharacters, segmentString);
    }

    /**
     * Encode a URL query segment following the item-list / key-value convention:
     * <pre>
     *   query  = "?" q-item *( "&amp;" q-item )
     *   q-item = q-segment [ "=" q-segment ]
     * </pre>
     *
     * @param segmentString the http URL query segment to encode
     * @return the URL-query-segment-safe UTF-8 encoded string,
     *         or null if segmentString is null
     */
    public static String encodeUrlQuerySegment(String segmentString) {
        return encodeString(queryUnescapedCharacters, segmentString);
    }

    /**
     * Encode the URL fragment section following RFC 3986.
     *
     * @param fragmentString the fragment to encode
     * @return the URL-fragment-safe UTF-8 encoded string,
     *         or null if fragmentString is null
     */
    public static String encodeUrlFragment(String fragmentString) {
        return encodeString(fragmentUnescapedCharacters, fragmentString);
    }

    /**
     * Percent-encode a string: every code point contained in
     * <code>unescapedSet</code> is copied through unchanged, every other code
     * point is converted to its UTF-8 bytes and each byte is written as
     * '%' followed by two upper-case hexadecimal digits.
     * This differs from java.net.URLEncoder.encode() which encodes according
     * to the x-www-form-urlencoded MIME format.
     *
     * @param unescapedSet the characters that are copied through unescaped
     * @param idString the identifier to encode (operating as a segment,
     *        according to the ABNF)
     * @return the URL-safe UTF-8 encoded identifier, or null if idString is null
     */
    private static String encodeString(BitSet unescapedSet, String idString) {
        if (idString == null) {
            return null;
        }
        int length = idString.length();
        StringBuilder encoded = new StringBuilder(length);

        // Walk the string by code point, not by char, so that a surrogate pair
        // is converted to UTF-8 as a single character.
        int index = 0;
        while (index < length) {
            int codePoint = idString.codePointAt(index);
            int width = Character.charCount(codePoint);
            if (unescapedSet.get(codePoint)) {
                encoded.appendCodePoint(codePoint);
            } else {
                appendPercentEncoded(encoded, idString.substring(index, index + width));
            }
            index += width;
        }
        return encoded.toString();
    }

    /**
     * Append the UTF-8 bytes of <code>text</code> to <code>out</code>, each
     * written as '%' followed by two upper-case hexadecimal digits.
     */
    private static void appendPercentEncoded(StringBuilder out, String text) {
        byte[] utf8;
        try {
            utf8 = text.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Fail loudly rather than fall back to the platform default
            // charset, which would silently produce non-UTF-8 escapes.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
        for (int j = 0; j < utf8.length; j++) {
            int unsigned = utf8[j] & 0xff;
            out.append('%');
            out.append(hexadecimal[unsigned >> 4]);
            out.append(hexadecimal[unsigned & 0x0f]);
        }
    }

    /**
     * Decode and return the specified pct-encoded String. UTF-8 encoding
     * is assumed. A literal '+' in the input is preserved as '+': it is
     * rewritten to "%2B" before the string is handed to
     * java.net.URLDecoder, which would otherwise turn it into a space.
     *
     * @param string the pct-encoded string to decode; may be null
     * @return the decoded string, or null if the input is null
     * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported
     * @exception IllegalArgumentException if a '%' character is not followed
     *            by a valid 2-digit hexadecimal number
     */
    public static String decodeString(String string) throws UnsupportedEncodingException {
        if (string == null) {
            return null;
        }
        // a string containing a '+' should not reach here
        // but if it does, don't want to fail
        String plusProtected = string.replace("+", "%2B");
        return URLDecoder.decode(plusProtected, "UTF-8");
    }

    // TODO: create tests for this
    /**
     * Replace the five XML predefined entity references
     * (&amp;gt; &amp;lt; &amp;apos; &amp;quot; &amp;amp;) with the characters
     * they stand for.
     *
     * @param dataString the XML-escaped text; may be null
     * @return the text with the predefined entities decoded,
     *         or null if dataString is null
     */
    public static String decodeXmlDataItems(String dataString) {
        if (dataString == null) {
            return null;
        }
        String decodedString = dataString.replace("&gt;", ">");
        decodedString = decodedString.replace("&lt;", "<");
        decodedString = decodedString.replace("&apos;", "'");
        decodedString = decodedString.replace("&quot;", "\"");
        // The ampersand entity must be decoded last; otherwise escaped text
        // such as "&amp;lt;" would be decoded twice.
        decodedString = decodedString.replace("&amp;", "&");
        return decodedString;
    }
}