/** * '$RCSfile$' * Copyright: 2007 Regents of the University of California and the * National Center for Ecological Analysis and Synthesis * * '$Author: jones $' * '$Date: 2013-05-23 20:26:48 +0000 (Thu, 23 May 2013) $' * '$Revision: 7766 $' * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package edu.ucsb.nceas.metacat; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.TimerTask; import org.apache.log4j.Logger; import edu.ucsb.nceas.metacat.database.DBConnection; import edu.ucsb.nceas.metacat.database.DBConnectionPool; import edu.ucsb.nceas.metacat.properties.PropertyService; import edu.ucsb.nceas.utilities.PropertyNotFoundException; /** * A Sitemap represents a document that lists all of the content of the Metacat * server for use by harvesting spiders that wish to index the contents of the * Metacat site. It is used to generate an XML representation of all of the URLs * of the site in order to facilitate indexing of the metacat site by search * engines. * * @author Matt Jones */ public class Sitemap extends TimerTask { private static Logger logMetacat = Logger.getLogger(Sitemap.class); /** * Construct a new instance of the Sitemap. * * @param directory * the location to store sitemap files * @param urlRoot * the base URL for constructing sitemap URLs * @param skin * the format skin to be used in URLs */ public Sitemap(File directory, String urlRoot, String skin) { super(); this.directory = directory; this.urlRoot = urlRoot; this.skin = skin; } /** * Execute the timed task when called, in this case by generating the * sitemap files needed for this Metacat instance. */ public void run() { generateSitemaps(); } /** * Generate all of the sitemap files needed to list the URLs from this * instance of Metacat, using the open sitemap format described here: * http://www.sitemaps.org/protocol.html * URLs are written to a single file, unless the maximum number of URLs * allowed in the sitemap file is exceeded, in which subsequent numbered * files are created. An index of the sitemaps is also created. * * The sitemap index can be registered with search index providers such as * Google, but beware that it needs to be accessible in a location above the * mount point for the service URLs. By default the files are placed in * {context}/sitemaps, but you will need to expose them at {context}/ for * them to be trusted by Google. See the Sitemaps.org documentation for * details. * * @param directory * an existing File directory in which to write the sitemaps * @param urlRoot * the base URL to use in constructing document URLs * @param skin * the name of the skin to be used in formatting metacat * documents */ public void generateSitemaps() { logMetacat.info("Running the Sitemap task."); // Test if the passed in File is a directory if (directory.isDirectory()) { // Query xml_documents to get list of documents StringBuffer query = new StringBuffer(); // TODO: make the doctype configurable in the query String sql = "SELECT xml_documents.docid, xml_documents.rev " + "FROM xml_documents, xml_access, identifier " + "WHERE xml_documents.doctype LIKE 'eml:%' " + "AND xml_documents.docid = identifier.docid " + "AND xml_documents.rev = identifier.rev " + "AND identifier.guid = xml_access.guid " + "AND xml_access.principal_name = 'public' " + "AND xml_access.perm_type = 'allow' " + "order by docid, rev"; query.append(sql); DBConnection dbConn = null; int serialNumber = -1; try { // Get a database connection from the pool dbConn = DBConnectionPool .getDBConnection("Sitemap.generateSitemap()"); serialNumber = dbConn.getCheckOutSerialNumber(); // Execute the query statement PreparedStatement stmt = dbConn.prepareStatement(query.toString()); stmt.execute(); ResultSet rs = stmt.getResultSet(); // Loop through all of the documents, and write them to a // sitemap File sitemapFile = null; OutputStreamWriter sitemap = null; int counter = 0; int fileNumber = 0; while (rs.next()) { // Check if a new sitemap file needs to be created if (counter % MAX_URLS_IN_FILE == 0) { // if a sitemap file is already open if (sitemapFile != null && sitemapFile.canWrite()) { // write the footer and close the file writeSitemapFooter(sitemap); } // Open a new sitemap file for writing fileNumber++; sitemapFile = new File(directory, fileRoot + fileNumber + ".xml"); sitemap = new OutputStreamWriter(new FileOutputStream(sitemapFile), Charset.forName("UTF-8")); // Write the sitemap document header for the new file writeSitemapHeader(sitemap); } String separator = PropertyService.getProperty("document.accNumSeparator"); String docid = rs.getString(1) + separator + rs.getString(2); writeSitemapEntry(sitemap, docid); counter++; } stmt.close(); writeSitemapFooter(sitemap); writeSitemapIndex(fileNumber); } catch (SQLException e) { logMetacat.warn("Error while writing to the sitemap file: " + e.getMessage()); } catch (IOException ioe) { logMetacat.warn("Could not open or write to the sitemap file." + ioe.getMessage()); } catch (PropertyNotFoundException pnfe) { logMetacat.warn("Could not retrieve the account number separator." + pnfe.getMessage()); } finally { // Return database connection to the pool DBConnectionPool.returnDBConnection(dbConn, serialNumber); } } else { logMetacat.warn("Sitemap not created because directory not valid."); } } /** * Write the header information in a single sitemap file. This includes the * XML prolog, the root element and namespace declaration, and the elements * leading up to the first URL entry. * * @param sitemap * the Writer to use for writing the header * @throws IOException * if there is a problem writing to the Writer */ private void writeSitemapHeader(Writer sitemap) throws IOException { sitemap.write(PROLOG); String header = "\n"; sitemap.write(header); sitemap.flush(); } /** * Write a URL entry to a single sitemap file. This includes the XML markup * surrounding a particular site URL. * * @param sitemap * the Writer to use for writing the URL * @param docid * the identifier to be written in the URL * @param urlRoot * the base URL to be used in constructing a URL * @param skin * the name of the skin to be used in constructing a URL * @throws IOException * if there is a problem writing to the Writer */ private void writeSitemapEntry(Writer sitemap, String docid) throws IOException { if (sitemap != null && docid != null && urlRoot != null) { StringBuffer url = new StringBuffer(); url.append(urlRoot); if (!urlRoot.endsWith("/")) { url.append("/"); } url.append(docid); if (skin != null) { url.append("/"); url.append(skin); } sitemap.write(""); sitemap.write(url.toString()); sitemap.write(""); // 2005-01-01 // monthly // 0.8 sitemap.write(""); sitemap.write("\n"); sitemap.flush(); } } /** * Write the footer information in a single sitemap file and close the file. * This includes the closing tag for the root element. * * @param sitemap * the Writer to use for writing the footer * @throws IOException * if there is a problem writing to the Writer */ private void writeSitemapFooter(Writer sitemap) throws IOException { if (sitemap != null) { String footer = "\n"; sitemap.write(footer); sitemap.close(); } } /** * Create an index file listing all of the sitemap files that were created. * @param fileNumber the number of sitemap files that were created. */ private void writeSitemapIndex(int fileNumber) { // Open a new sitemapIndex file for writing File sitemapIndexFile = null; OutputStreamWriter sitemapIndex = null; sitemapIndexFile = new File(directory, indexFilename); try { sitemapIndex = new OutputStreamWriter(new FileOutputStream(sitemapIndexFile), Charset.forName("UTF-8")); // Write the sitemap index header for the new file sitemapIndex.write(PROLOG); String header = "\n"; sitemapIndex.write(header); sitemapIndex.flush(); // Write out one index entry for each sitemap file for (int fn = 1; fn <= fileNumber; fn++) { String filename = fileRoot + fileNumber + ".xml"; writeSitemapIndexEntry(sitemapIndex, filename); } // Write the sitemap index footer content if (sitemapIndex != null) { String footer = "\n"; sitemapIndex.write(footer); sitemapIndex.close(); } // Close the index file if (sitemapIndex != null) { sitemapIndex.close(); } } catch (IOException e) { logMetacat.warn("Could not open or write to the sitemap index file." + e.getMessage()); } } /** * Write a single line of the sitemap index file containing the URL to a specific sitemap file. * @param sitemapIndex the writer to which the index information is written * @param filename the name of the index file to be used * @throws IOException on error writing to the index file */ private void writeSitemapIndexEntry(Writer sitemapIndex, String filename) throws IOException { if (sitemapIndex != null && filename != null && urlRoot != null) { StringBuffer url = new StringBuffer(); url.append(urlRoot); if (!urlRoot.endsWith("/")) { url.append("/"); } url.append(filename); sitemapIndex.write(""); sitemapIndex.write(url.toString()); sitemapIndex.write(""); Date now = new Date(); SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd"); sitemapIndex.write(""+ fmt.format(now) +""); sitemapIndex.write(""); sitemapIndex.write("\n"); sitemapIndex.flush(); } } // Member variables /** The directory in which sitemaps are written. */ private File directory; /** The root url for constructing sitemap URLs. */ private String urlRoot; /** The name of the format skin to be used in sitemap URLs. */ private String skin; /** Maximum number of URLs to write to a single sitemap file */ static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google /** The root name to be used in naming sitemap files. */ static final String fileRoot = "metacat"; /** The name to give to the sitemap index file */ static final String indexFilename = "metacatSitemapIndex.xml"; /** A String constant containing the XML prolog to be written in files. */ static final String PROLOG = "\n"; }