/** * '$RCSfile$' * Copyright: 2009 University of New Mexico and the * Regents of the University of California * * '$Author: costa $' * '$Date: 2009-07-27 17:47:44 -0400 (Mon, 27 Jul 2009) $' * '$Revision: 4999 $' * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Additional Copyright 2006 OCLC, Online Computer Library Center * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.ucsb.nceas.metacat.oaipmh.harvester; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.net.HttpURLConnection; import java.net.URL; import java.util.Date; import java.util.HashMap; import java.util.StringTokenizer; import java.util.zip.GZIPInputStream; import java.util.zip.InflaterInputStream; import java.util.zip.ZipInputStream; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Result; import javax.xml.transform.Source; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.log4j.Logger; import org.apache.xpath.XPathAPI; import org.w3c.dom.DOMImplementation; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * HarvesterVerb is the parent class for each of the OAI verbs. * * @author Duane Costa, University of New Mexico, LTER Network Office * @author Jeffrey A. Young, OCLC Online Computer Library Center */ public abstract class HarvesterVerb { /* Class variables */ private static Logger logger = Logger.getLogger(HarvesterVerb.class); public static final String SCHEMA_LOCATION_V2_0 = "http://www.openarchives.org/OAI/2.0/ " + "http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"; private static HashMap builderMap = new HashMap(); private static DocumentBuilderFactory documentBuilderFactory = null; private static Element namespaceElement = null; private static TransformerFactory transformerFactory = TransformerFactory.newInstance(); /* Instance variables */ private Document document = null; private String schemaLocation = null; private String requestURL = null; /* Constructors */ /** * Mock object creator (for unit testing purposes) */ public HarvesterVerb() { } /** * Performs the OAI request * * @param requestURL * @throws IOException * @throws ParserConfigurationException * @throws SAXException * @throws TransformerException */ public HarvesterVerb(String requestURL) throws IOException, ParserConfigurationException, SAXException, TransformerException { this.requestURL = requestURL; } /* Static initialization code */ static { try { /* Load DOM Document */ documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilderFactory.setNamespaceAware(true); Thread thread = Thread.currentThread(); DocumentBuilder builder = documentBuilderFactory.newDocumentBuilder(); builderMap.put(thread, builder); DOMImplementation impl = builder.getDOMImplementation(); Document namespaceHolder = impl.createDocument( "http://www.oclc.org/research/software/oai/harvester", "harvester:namespaceHolder", null ); namespaceElement = namespaceHolder.getDocumentElement(); namespaceElement.setAttributeNS( "http://www.w3.org/2000/xmlns/", "xmlns:harvester", "http://www.oclc.org/research/software/oai/harvester" ); namespaceElement.setAttributeNS( "http://www.w3.org/2000/xmlns/", "xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance" ); namespaceElement.setAttributeNS( "http://www.w3.org/2000/xmlns/", "xmlns:oai20", "http://www.openarchives.org/OAI/2.0/" ); } catch (Exception e) { e.printStackTrace(); } } /* Instance methods */ /* Primary OAI namespaces */ /** * Get the OAI response as a DOM object * * @return the DOM for the OAI response */ public Document getDocument() { return document; } /** * Get the OAI errors * * @return a NodeList of /oai:OAI-PMH/oai:error elements * @throws TransformerException */ public NodeList getErrors() throws TransformerException { if (SCHEMA_LOCATION_V2_0.equals(getSchemaLocation())) { return getNodeList("/oai20:OAI-PMH/oai20:error"); } else { return null; } } /** * Get a NodeList containing the nodes in the response DOM for the specified * xpath * * @param xpath * @return the NodeList for the xpath into the response DOM * @throws TransformerException */ public NodeList getNodeList(String xpath) throws TransformerException { Document document = getDocument(); return XPathAPI.selectNodeList(document, xpath, namespaceElement); } /** * Get the OAI request URL for this response * * @return the OAI request URL as a String */ public String getRequestURL() { return requestURL; } /** * Get the xsi:schemaLocation for the OAI response * * @return the xsi:schemaLocation value */ public String getSchemaLocation() { return schemaLocation; } /** * Get the String value for the given XPath location in the response DOM * * @param xpath * @return a String containing the value of the XPath location. * @throws TransformerException */ public String getSingleString(String xpath) throws TransformerException { Document document = getDocument(); org.apache.xpath.objects.XObject xobject; xobject = XPathAPI.eval(document, xpath, namespaceElement); String str = xobject.str(); return str; } /** * Preforms the OAI request for this OAI-PMH verb * * @throws IOException * @throws ParserConfigurationException * @throws SAXException * @throws TransformerException */ public void runVerb() throws IOException, ParserConfigurationException, SAXException, TransformerException { //logger.debug("requestURL=" + requestURL); InputStream in = null; URL url = new URL(requestURL); HttpURLConnection con = null; int responseCode = 0; do { con = (HttpURLConnection) url.openConnection(); con.setRequestProperty("User-Agent", "OAIHarvester/2.0"); con.setRequestProperty("Accept-Encoding", "compress, gzip, identify"); try { responseCode = con.getResponseCode(); //logger.debug("responseCode=" + responseCode); } catch (FileNotFoundException e) { // assume it's a 503 response logger.info(requestURL, e); responseCode = HttpURLConnection.HTTP_UNAVAILABLE; } if (responseCode == HttpURLConnection.HTTP_UNAVAILABLE) { long retrySeconds = con.getHeaderFieldInt("Retry-After", -1); if (retrySeconds == -1) { long now = (new Date()).getTime(); long retryDate = con.getHeaderFieldDate("Retry-After", now); retrySeconds = retryDate - now; } if (retrySeconds == 0) { // Apparently, it's a bad URL throw new FileNotFoundException("Bad URL?"); } System.err.println("Server response: Retry-After=" + retrySeconds); if (retrySeconds > 0) { try { Thread.sleep(retrySeconds * 1000); } catch (InterruptedException ex) { ex.printStackTrace(); } } } } while (responseCode == HttpURLConnection.HTTP_UNAVAILABLE); String contentEncoding = con.getHeaderField("Content-Encoding"); //logger.debug("contentEncoding=" + contentEncoding); if ("compress".equals(contentEncoding)) { ZipInputStream zis = new ZipInputStream(con.getInputStream()); zis.getNextEntry(); in = zis; } else if ("gzip".equals(contentEncoding)) { in = new GZIPInputStream(con.getInputStream()); } else if ("deflate".equals(contentEncoding)) { in = new InflaterInputStream(con.getInputStream()); } else { in = con.getInputStream(); } InputSource data = new InputSource(in); Thread t = Thread.currentThread(); DocumentBuilder builder = builderMap.get(t); if (builder == null) { builder = documentBuilderFactory.newDocumentBuilder(); builderMap.put(t, builder); } document = builder.parse(data); String singleString = getSingleString("/*/@xsi:schemaLocation"); StringTokenizer tokenizer = new StringTokenizer(singleString, " "); StringBuffer sb = new StringBuffer(); while (tokenizer.hasMoreTokens()) { if (sb.length() > 0) sb.append(" "); sb.append(tokenizer.nextToken()); } String schemaLocationStr = sb.toString(); this.schemaLocation = schemaLocationStr; } /** * Transform the document content to a string and return it. * * @return returnString - the string that results from transforming the * document */ public String toString() { Document document = getDocument(); Source source = new DOMSource(document); StringWriter stringWriter = new StringWriter(); Result result = new StreamResult(stringWriter); try { Transformer idTransformer = transformerFactory.newTransformer(); idTransformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no"); idTransformer.transform(source, result); String returnString = stringWriter.toString(); return returnString; } catch (TransformerException e) { return e.getMessage(); } } }