/**
 *  '$RCSfile$'
 *  Copyright: 2007 Regents of the University of California and the
 *             National Center for Ecological Analysis and Synthesis
 *
 *   '$Author$'
 *     '$Date$'
 * '$Revision$'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package edu.ucsb.nceas.metacat;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimerTask;
import java.net.URLEncoder;

import org.apache.log4j.Logger;
import org.apache.commons.lang.StringEscapeUtils;

import edu.ucsb.nceas.metacat.database.DBConnection;
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
import edu.ucsb.nceas.metacat.properties.PropertyService;
import edu.ucsb.nceas.utilities.PropertyNotFoundException;

/**
 * A Sitemap represents a document that lists all of the content of the Metacat
 * server for use by harvesting spiders that wish to index the contents of the
 * Metacat site. It is used to generate an XML representation of all of the URLs
 * of the site in order to facilitate indexing of the metacat site by search
 * engines.
 *
 * Which objects are included?
 *
 * - Only documents with public read permission are included
 * - Only documents with object_formats in the xml_catalog table are included
 * - All non-obsoleted metadata objects are included in the sitemap(s)
 *
 * Other notes:
 *
 * - The sitemaps this class generates are intended to be served by another
 *   application such as MetacatUI
 * - A sitemap index is generated regardless of the number of URLs present
 * - URLs for the location of the sitemaps and the entries themselves are
 *   controlled by the 'sitemap.location.base' and 'sitemap.entry.base'
 *   properties which can be full URLs or absolute paths.
 *
 *   - sitemap.location.base controls first part of the URLs in the sitemap
 *     index
 *   - sitemap.entry.base controls the first part of the URLs in the sitemap
 *     files themselves
 *
 * @author Matt Jones
 * @author Bryce Mecum
 */
public class Sitemap extends TimerTask {

    private static final Logger logMetacat = Logger.getLogger(Sitemap.class);

    /** XML namespace shared by sitemap files and the sitemap index. */
    private static final String SITEMAP_NAMESPACE =
            "http://www.sitemaps.org/schemas/sitemap/0.9";

    /**
     * Construct a new instance of the Sitemap.
     *
     * @param directory
     *            the location to store sitemap files
     * @param locationBase
     *            the base URL for constructing sitemap location URLs
     * @param entryBase
     *            the base URL for constructing sitemap entry URLs
     */
    public Sitemap(File directory, String locationBase, String entryBase) {
        super();
        this.directory = directory;
        this.locationBase = locationBase;
        this.entryBase = entryBase;
    }

    /**
     * Execute the timed task when called, in this case by generating the
     * sitemap files needed for this Metacat instance.
     */
    @Override
    public void run() {
        generateSitemaps();
    }

    /**
     * Generate all of the sitemap files needed to list the URLs from this
     * instance of Metacat, using the open sitemap format described here:
     *   http://www.sitemaps.org/protocol.html
     * URLs are written to a single file, unless the maximum number of URLs
     * allowed in the sitemap file is exceeded, in which case subsequent
     * numbered files are created. An index of the sitemaps is also created.
     *
     * The sitemap index can be registered with search index providers such as
     * Google, but beware that it needs to be accessible in a location above the
     * mount point for the service URLs. By default the files are placed in
     * {context}/sitemaps, but you will need to expose them at a location
     * matching what's set in the sitemap.location.base and sitemap.entry.base
     * properties in order to be trusted by Google. See the Sitemaps.org
     * documentation for details.
     *
     * Database and I/O errors are logged at WARN level and not rethrown; the
     * open sitemap file, the statement and the database connection are
     * released on every path.
     */
    public void generateSitemaps() {

        logMetacat.info("Running the Sitemap task. Directory is " + directory +
                        " and locationBase is " + locationBase +".");

        // Guard clause: nothing can be written without a valid target
        // directory (a null directory is treated the same as a missing one).
        if (directory == null || !directory.isDirectory()) {
            logMetacat.warn("Sitemap not created because directory not valid.");
            return;
        }

        /* Query for documents that are:
         *   - Metadata (their object_format is in the xml_catalog)
         *   - Latest/head versions (their obsoleted_by field is NULL)
         *   - Not archived
         *   - Covered by an 'allow' access rule for the 'public' principal
         *
         * NOTE(review): the query does not inspect the permission bits of the
         * access rule and does not de-duplicate rows, so a document with
         * several public 'allow' rules may be listed more than once - confirm
         * whether that can occur in practice.
         */

        // We use a subquery to filter documents based upon whether they use
        // a format ID in the xml_catalog table
        String metadata_formats =
            "SELECT public_id from xml_catalog " +
            "WHERE public_id is not NULL";

        String query =
            "SELECT identifier.guid as pid " +
            "FROM identifier " +
            "LEFT JOIN systemmetadata on identifier.guid = systemmetadata.guid " +
            "LEFT JOIN xml_access on identifier.guid = xml_access.guid " +
            "WHERE " +
            "systemmetadata.object_format in (" + metadata_formats + ") AND " +
            "systemmetadata.obsoleted_by is NULL AND " +
            "systemmetadata.archived = FALSE AND " +
            "xml_access.principal_name = 'public' AND " +
            "xml_access.perm_type = 'allow' " +
            "ORDER BY systemmetadata.date_uploaded ASC;";

        DBConnection dbConn = null;
        int serialNumber = -1;
        PreparedStatement stmt = null;
        OutputStreamWriter sitemap = null;

        try {
            // Get a database connection from the pool
            dbConn = DBConnectionPool
                    .getDBConnection("Sitemap.generateSitemap()");
            serialNumber = dbConn.getCheckOutSerialNumber();

            // Execute the query statement
            stmt = dbConn.prepareStatement(query);
            stmt.execute();
            ResultSet rs = stmt.getResultSet();

            // Loop through all of the documents, and write them to a sitemap
            int counter = 0;
            int fileNumber = 0;

            while (rs.next()) {
                // Check if a new sitemap file needs to be created
                if (counter % MAX_URLS_IN_FILE == 0) {
                    // Finish the previous file, if any (no-op for null).
                    writeSitemapFooter(sitemap);
                    sitemap = null;

                    // Open a new sitemap file for writing
                    fileNumber++;
                    File sitemapFile =
                            new File(directory, fileRoot + fileNumber + ".xml");
                    sitemap = new OutputStreamWriter(
                            new FileOutputStream(sitemapFile),
                            Charset.forName("UTF-8"));

                    // Write the sitemap document header for the new file
                    writeSitemapHeader(sitemap);
                }

                writeSitemapEntry(sitemap, rs.getString(1));
                counter++;
            }

            // Finish the last file and list every file in the index.
            writeSitemapFooter(sitemap);
            sitemap = null;
            writeSitemapIndex(fileNumber);
        } catch (SQLException e) {
            logMetacat.warn("Error while writing to the sitemap file: "
                    + e.getMessage(), e);
        } catch (IOException ioe) {
            logMetacat.warn("Could not open or write to the sitemap file."
                    + ioe.getMessage(), ioe);
        } finally {
            // Only non-null here if an error interrupted the loop above.
            closeQuietly(sitemap);

            if (stmt != null) {
                try {
                    // Closing the statement also closes its ResultSet.
                    stmt.close();
                } catch (SQLException e) {
                    logMetacat.warn("Could not close the sitemap query statement: "
                            + e.getMessage(), e);
                }
            }

            // Return database connection to the pool
            DBConnectionPool.returnDBConnection(dbConn, serialNumber);
        }
    }

    /**
     * Write the header information in a single sitemap file. This includes the
     * XML prolog, the root element and namespace declaration, and the elements
     * leading up to the first URL entry.
     *
     * @param sitemap
     *            the Writer to use for writing the header
     * @throws IOException
     *             if there is a problem writing to the Writer
     */
    private void writeSitemapHeader(Writer sitemap) throws IOException {
        sitemap.write(PROLOG);
        sitemap.write("<urlset xmlns=\"" + SITEMAP_NAMESPACE + "\">\n");
        sitemap.flush();
    }

    /**
     * Write a URL entry to a single sitemap file. This includes the XML markup
     * surrounding a particular site URL. Nothing is written when the writer,
     * the pid or the entry base is null.
     *
     * @param sitemap
     *            the Writer to use for writing the URL
     * @param pid
     *            the identifier to be written in the URL
     * @throws IOException
     *             if there is a problem writing to the Writer
     */
    private void writeSitemapEntry(Writer sitemap, String pid)
            throws IOException {
        if (sitemap == null || pid == null || entryBase == null) {
            return;
        }

        StringBuilder url = new StringBuilder(entryBase);
        if (!entryBase.endsWith("/")) {
            url.append("/");
        }

        // URL-encode _and_ XML escape the PID.
        url.append(StringEscapeUtils.escapeXml(URLEncoder.encode(pid, "UTF-8")));

        sitemap.write("  <url><loc>");
        sitemap.write(url.toString());
        sitemap.write("</loc></url>\n");
        sitemap.flush();
    }

    /**
     * Write the footer information in a single sitemap file and close the file.
     * This includes the closing tag for the root element. A null writer is
     * ignored.
     *
     * @param sitemap
     *            the Writer to use for writing the footer
     * @throws IOException
     *             if there is a problem writing to the Writer
     */
    private void writeSitemapFooter(Writer sitemap) throws IOException {
        if (sitemap == null) {
            return;
        }
        try {
            sitemap.write("</urlset>\n");
        } finally {
            // Close even when the footer could not be written.
            sitemap.close();
        }
    }

    /**
     * Create an index file listing all of the sitemap files that were created.
     * I/O errors are logged at WARN level and not rethrown.
     *
     * @param fileNumber the number of sitemap files that were created.
     */
    private void writeSitemapIndex(int fileNumber) {
        File sitemapIndexFile = new File(directory, indexFilename);
        OutputStreamWriter sitemapIndex = null;

        try {
            // Open a new sitemapIndex file for writing
            sitemapIndex = new OutputStreamWriter(
                    new FileOutputStream(sitemapIndexFile),
                    Charset.forName("UTF-8"));

            // Write the sitemap index header for the new file
            sitemapIndex.write(PROLOG);
            sitemapIndex.write(
                    "<sitemapindex xmlns=\"" + SITEMAP_NAMESPACE + "\">\n");
            sitemapIndex.flush();

            // Write out one index entry for each sitemap file. The loop
            // variable, not the total, names the file being listed.
            for (int fn = 1; fn <= fileNumber; fn++) {
                writeSitemapIndexEntry(sitemapIndex, fileRoot + fn + ".xml");
            }

            // Write the sitemap index footer content and close the file.
            // Closing inside the try lets a failed final flush be reported.
            sitemapIndex.write("</sitemapindex>\n");
            sitemapIndex.close();
        } catch (IOException e) {
            logMetacat.warn("Could not open or write to the sitemap index file."
                    + e.getMessage(), e);
        } finally {
            // No-op after a successful close; releases the file on error.
            closeQuietly(sitemapIndex);
        }
    }

    /**
     * Write a single entry of the sitemap index file containing the URL to a
     * specific sitemap file and today's date as its last-modified date.
     * Nothing is written when the writer, the filename or the location base
     * is null.
     *
     * @param sitemapIndex the writer to which the index information is written
     * @param filename the name of the sitemap file the entry points to
     * @throws IOException on error writing to the index file
     */
    private void writeSitemapIndexEntry(Writer sitemapIndex, String filename)
            throws IOException {
        if (sitemapIndex == null || filename == null || locationBase == null) {
            return;
        }

        StringBuilder url = new StringBuilder(locationBase);
        if (!locationBase.endsWith("/")) {
            url.append("/");
        }
        url.append(filename);

        // SimpleDateFormat is not thread-safe, so it is created per call.
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd");
        String lastModified = fmt.format(new Date());

        sitemapIndex.write("  <sitemap>\n    <loc>");
        sitemapIndex.write(url.toString());
        sitemapIndex.write("</loc>\n");
        sitemapIndex.write("    <lastmod>" + lastModified + "</lastmod>\n");
        sitemapIndex.write("  </sitemap>\n");
        sitemapIndex.flush();
    }

    /**
     * Close a writer during cleanup, logging rather than propagating any
     * failure so that it cannot mask an earlier error.
     *
     * @param writer the writer to close; may be null
     */
    private static void closeQuietly(Writer writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            logMetacat.warn("Could not close sitemap writer: " + e.getMessage(), e);
        }
    }

    // Member variables

    /** The directory in which sitemaps are written. */
    private final File directory;

    /** The root url for constructing sitemap location URLs. */
    private final String locationBase;

    /** The root url for constructing sitemap entry URLs. */
    private final String entryBase;

    /** Maximum number of URLs to write to a single sitemap file */
    static final int MAX_URLS_IN_FILE = 50000; // 50,000 according to Google

    /** The root name to be used in naming sitemap files. */
    static final String fileRoot = "sitemap";

    /** The name to give to the sitemap index file */
    static final String indexFilename = "sitemap_index.xml";

    /** A String constant containing the XML prolog to be written in files. */
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
}