/** * This work was crfield name: eated" by participants in the DataONE project, and is * jointly copyrighted by participating institutions in DataONE. For * more information on DataONE, see our web site at http://dataone.org. * * Copyright 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * $Id$ */ package org.dataone.cn.indexer.resourcemap; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.math.BigInteger; import java.security.NoSuchAlgorithmException; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import org.apache.commons.lang3.StringUtils; import org.dataone.exceptions.MarshallingException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.params.ModifiableSolrParams; import org.dataone.cn.index.DataONESolrJettyTestBase; import org.dataone.cn.indexer.annotation.RdfXmlSubprocessor; import org.dataone.cn.indexer.convert.SolrDateConverter; import org.dataone.cn.indexer.solrhttp.HTTPService; import org.dataone.cn.indexer.solrhttp.SolrDoc; import org.dataone.cn.indexer.solrhttp.SolrElementField; import org.dataone.service.types.v1.AccessPolicy; import org.dataone.service.types.v1.Checksum; import org.dataone.service.types.v1.Identifier; import org.dataone.service.types.v1.NodeReference; import org.dataone.service.types.v1.ObjectFormatIdentifier; import org.dataone.service.types.v1.Permission; import org.dataone.service.types.v1.Subject; import org.dataone.service.types.v1.util.AccessUtil; import org.dataone.service.types.v1.util.ChecksumUtil; import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.util.TypeMarshaller; import org.junit.After; import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import org.springframework.core.io.Resource; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; /** * RDF/XML Subprocessor test for provenance field handling */ @ThreadLeakScope(ThreadLeakScope.Scope.NONE) public class RdfXmlProcessorTest extends DataONESolrJettyTestBase { /* Log it */ private static Log log = LogFactory.getLog(RdfXmlProcessorTest.class); /* the conext with provenance-specific bean definitions */ private ApplicationContext provenanceContext = null; /* The index task processor used to process tasks in the queue */ //protected IndexTaskProcessor processor; /* The task generator that adds tasks to the queue */ //private IndexTaskGenerator generator; /* the RDF/XML resource map to parse */ private Resource provAlaWaiNS02MatlabProcessing2RDF; private String provAlaWaiNS02MatlabProcessing2RDFPid = "ala-wai-ns02-matlab-processing.2.rdf"; /* The three Matlab scripts involved in the processing */ private Resource provAlaWaiNS02MatlabProcessingDataProcessor1m; private String provAlaWaiNS02MatlabProcessingDataProcessor1mPid = "ala-wai-canal-ns02-matlab-processing-DataProcessor.1.m"; private Resource provAlaWaiNS02MatlabProcessingConfigure1m; private String provAlaWaiNS02MatlabProcessingConfigure1mPid = "ala-wai-canal-ns02-matlab-processing-Configure.1.m"; private Resource provAlaWaiNS02MatlabProcessingScheduleAW02XX_001CTDXXXXR00Processing1m; private String provAlaWaiNS02MatlabProcessingScheduleAW02XX_001CTDXXXXR00Processing1mPid = "ala-wai-canal-ns02-matlab-processing-schedule_AW02XX_001CTDXXXXR00_processing.1.m"; /* The EML science metadata describing the processing */ private Resource provAlaWaiNS02MatlabProcessingEML1xml; private String provAlaWaiNS02MatlabProcessingEML1xmlPid = "ala-wai-canal-ns02-matlab-processing.eml.1.xml"; /* The input data being processed */ private Resource provAlaWaiNS02CTDData1txt; private String provAlaWaiNS02CTDData1txtPid = "ala-wai-canal-ns02-ctd-data.1.txt"; /* The processed output image */ private Resource provAlaWaiNS02ImageDataAW02XX_001CTDXXXXR00_20150203_10day1jpg; private String provAlaWaiNS02ImageDataAW02XX_001CTDXXXXR00_20150203_10day1jpgPid = "ala-wai-canal-ns02-image-data-AW02XX_001CTDXXXXR00_20150203_10day.1.jpg"; /* The eml 2.2.0 object is part of a portal */ private Resource partEml220; private String partEml220Pid = "eml-test-doc"; /* The portal object which has the part eml 220 object*/ private Resource partPortal; private String partPortalPid = "urn:uuid:b210adf0-f08a-4cae-aa86-5b64605e9297"; /* The resource map describing the hasPart/isPartOf relationship */ private Resource partResourcemap; private String partResourcemapPid = "resourcemap-with-part.xml"; /* An instance of the RDF/XML Subprocessor */ private RdfXmlSubprocessor provRdfXmlSubprocessor; /* A date converter for many date strings */ private SolrDateConverter dateConverter = new SolrDateConverter(); /* Store a map of expected Solr fields and their values for testing */ private HashMap expectedFields = new HashMap(); /* Define provenance field names used in the index */ private String WAS_DERIVED_FROM_FIELD = "prov_wasDerivedFrom"; private String WAS_GENERATED_BY_FIELD = "prov_wasGeneratedBy"; private String WAS_INFORMED_BY_FIELD = "prov_wasInformedBy"; private String USED_FIELD = "prov_used"; private String GENERATED_FIELD = "prov_generated"; private String GENERATED_BY_PROGRAM_FIELD = "prov_generatedByProgram"; private String GENERATED_BY_EXECUTION_FIELD = "prov_generatedByExecution"; private String GENERATED_BY_USER_FIELD = "prov_generatedByUser"; private String USED_BY_PROGRAM_FIELD = "prov_usedByProgram"; private String USED_BY_EXECUTION_FIELD = "prov_usedByExecution"; private String USED_BY_USER_FIELD = "prov_usedByUser"; private String WAS_EXECUTED_BY_EXECUTION_FIELD = "prov_wasExecutedByExecution"; private String WAS_EXECUTED_BY_USER_FIELD = "prov_wasExecutedByUser"; private String HAS_SOURCES_FIELD = "prov_hasSources"; private String HAS_DERIVATIONS_FIELD = "prov_hasDerivations"; private String INSTANCE_OF_CLASS_FIELD = "prov_instanceOfClass"; private static final int SLEEPTIME = 5000; @BeforeClass public static void init() { //HazelcastClientFactoryTest.setUp(); } @AfterClass public static void cleanup() throws Exception { //HazelcastClientFactoryTest.shutDown(); } /** * For each test, set up the Solr service and test data * * @throws Exception */ @Before public void setUp() throws Exception { // Start up the embedded Jetty server and Solr service super.setUp(); // load the prov context beans configureSpringResources(); // instantiate the subprocessor provRdfXmlSubprocessor = (RdfXmlSubprocessor) context.getBean("rdfXMLSubprocessor"); sendSolrDeleteAll(); } /** * For each test, clean up, bring down the Solr service */ @After public void tearDown() throws Exception { super.tearDown(); } /* * Compares the indexed provenance Solr fields to the expected fields */ protected boolean compareFields(HashMap expectedFields, InputStream resourceMap, RdfXmlSubprocessor provRdfXmlSubProcessor, String identifier, String referencedPid) throws Exception { // A map for the sub processor to populate Map docs = new TreeMap(); // Build a minimal SolrDoc keyed by the id SolrDoc solrDoc = new SolrDoc(); SolrElementField identifierField = new SolrElementField(); identifierField.setName(SolrElementField.FIELD_ID); identifierField.setValue(identifier); solrDoc.addField(identifierField); docs.put(identifier, solrDoc); // The returned map with processed Solr documents Map solrDocs = provRdfXmlSubProcessor.processDocument(identifier, docs, resourceMap); // A list of Solr fields filtered by the target object identifier System.out.println("referencedPid: " + referencedPid); Set keys = solrDocs.keySet(); for (String key : keys) { System.out.println("============= the key is " + key); } SolrDoc referencedPidDoc = solrDocs.get(referencedPid); if (referencedPidDoc == null) { fail("Failed to find a SolrDoc for referencedPid: " + referencedPid); } List fields = referencedPidDoc.getFieldList(); // compare the expected and processed fields for (SolrElementField field : fields) { String name = field.getName(); String value = field.getValue(); log.debug("Field name: " + name); String expectedValue = expectedFields.get(name); if (expectedValue != null) { List expectedValues = Arrays.asList(StringUtils.split(expectedValue, "||")); if (expectedValues != null && !expectedValues.isEmpty()) { System.out.println("Checking value:\t" + value); System.out.println("in expected: \t" + expectedValues); Assert.assertTrue(expectedValues.contains(value)); } } } return true; } /** * Test if the provenance fields in resource maps are processed correctly with the * RdfXmlsubprocessor. This test does not add content to Hazelcast or Solr. * * @throws Exception */ //@Ignore @Test public void testProvenanceFields() throws Exception { log.debug("Testing RDF/XML provenance indexing of ala-wai-ns02-matlab-processing.2.rdf: "); // Ensure fields associated with the data input objects are indexed expectedFields.clear(); expectedFields.put(USED_BY_PROGRAM_FIELD, "ala-wai-canal-ns02-matlab-processing-schedule_AW02XX_001CTDXXXXR00_processing.1.m" + "||" + "ala-wai-canal-ns02-matlab-processing-Configure.1.m" + "||" + "ala-wai-canal-ns02-matlab-processing-DataProcessor.1.m"); expectedFields .put(USED_BY_EXECUTION_FIELD, "urn:uuid:6EC8CAB7-2063-4440-BA23-364313C145FC"); expectedFields .put(INSTANCE_OF_CLASS_FIELD, "http://purl.dataone.org/provone/2015/01/15/ontology#Data"); expectedFields.put(USED_BY_USER_FIELD, "urn:uuid:D89221AD-E251-4CCB-B515-09D869DB1A61"); compareFields(expectedFields, provAlaWaiNS02MatlabProcessing2RDF.getInputStream(), provRdfXmlSubprocessor, "ala-wai-ns02-matlab-processing.2.rdf", "ala-wai-canal-ns02-ctd-data.1.txt"); // Ensure fields associated with the data output objects are indexed expectedFields.clear(); expectedFields.put(WAS_DERIVED_FROM_FIELD, "ala-wai-canal-ns02-ctd-data.1.txt"); expectedFields.put(GENERATED_BY_PROGRAM_FIELD, "ala-wai-canal-ns02-matlab-processing-schedule_AW02XX_001CTDXXXXR00_processing.1.m" + "||" + "ala-wai-canal-ns02-matlab-processing-Configure.1.m" + "||" + "ala-wai-canal-ns02-matlab-processing-DataProcessor.1.m"); expectedFields.put(GENERATED_BY_EXECUTION_FIELD, "urn:uuid:6EC8CAB7-2063-4440-BA23-364313C145FC"); expectedFields .put(GENERATED_BY_USER_FIELD, "urn:uuid:D89221AD-E251-4CCB-B515-09D869DB1A61"); expectedFields .put(INSTANCE_OF_CLASS_FIELD, "http://purl.dataone.org/provone/2015/01/15/ontology#Data"); compareFields(expectedFields, provAlaWaiNS02MatlabProcessing2RDF.getInputStream(), provRdfXmlSubprocessor, "ala-wai-ns02-matlab-processing.2.rdf", "ala-wai-canal-ns02-image-data-AW02XX_001CTDXXXXR00_20150203_10day.1.jpg"); // Ensure fields associated with program objects are indexed expectedFields.clear(); expectedFields.put(USED_FIELD, "ala-wai-canal-ns02-ctd-data.1.txt"); expectedFields.put(GENERATED_FIELD, "ala-wai-canal-ns02-image-data-AW02XX_001CTDXXXXR00_20150203_10day.1.jpg"); compareFields(expectedFields, provAlaWaiNS02MatlabProcessing2RDF.getInputStream(), provRdfXmlSubprocessor, "ala-wai-ns02-matlab-processing.2.rdf", "ala-wai-canal-ns02-matlab-processing-schedule_AW02XX_001CTDXXXXR00_processing.1.m"); // Ensure fields associated with the data input object's metadata are indexed expectedFields.clear(); expectedFields.put(HAS_DERIVATIONS_FIELD, "ala-wai-canal-ns02-image-data-AW02XX_001CTDXXXXR00_20150203_10day.1.jpg"); compareFields(expectedFields, provAlaWaiNS02MatlabProcessing2RDF.getInputStream(), provRdfXmlSubprocessor, "ala-wai-ns02-matlab-processing.2.rdf", "ala-wai-canal-ns02-ctd-data.1.txt"); // Ensure fields associated with the data output object's metadata are indexed //expectedFields.clear(); //expectedFields.put(HAS_SOURCES_FIELD, "ala-wai-canal-ns02-ctd-data.1.txt"); //compareFields(expectedFields, provAlaWaiNS02MatlabProcessing2RDF.getInputStream(), // provRdfXmlSubprocessor, "ala-wai-ns02-matlab-processing.2.rdf", // "ala-wai-canal-ns02-image-data.eml.1.xml"); } /** * Test if the hasPart/isPartOf fields in resource maps are processed correctly with the * RdfXmlsubprocessor. This test does not add content to Hazelcast or Solr. * * @throws Exception */ @Test public void testPartFields() throws Exception { log.debug("Testing RDF/XML parts indexing of resourcemap-with-part.xml: "); expectedFields.clear(); expectedFields.put("hasPart", "urn:uuid:f18812ac-7f4f-496c-82cc-3f4f54830274"); compareFields(expectedFields, partResourcemap.getInputStream(), provRdfXmlSubprocessor, "resource_map_urn:uuid:cd489c7e-be88-4d57-b13a-911b25a0b47f", null); // Ensure fields associated with the data input objects are indexed expectedFields.clear(); expectedFields.put("isPartOf", "urn:uuid:27ae3627-be62-4963-859a-8c96d940cadc"); compareFields(expectedFields, partResourcemap.getInputStream(), provRdfXmlSubprocessor, "resource_map_urn:uuid:cd489c7e-be88-4d57-b13a-911b25a0b47f", "urn:uuid:f18812ac-7f4f-496c-82cc-3f4f54830274"); } /** * Test the end to end index processing of a resource map with provenance statements * * @throws Exception */ //@Ignore @Test public void testInsertProvResourceMap() throws Exception { String script1 = provAlaWaiNS02MatlabProcessingDataProcessor1mPid; indexObjectToSolr(script1, provAlaWaiNS02MatlabProcessingDataProcessor1m); String script2 = provAlaWaiNS02MatlabProcessingConfigure1mPid; indexObjectToSolr(script2,provAlaWaiNS02MatlabProcessingConfigure1m); String script3 = provAlaWaiNS02MatlabProcessingScheduleAW02XX_001CTDXXXXR00Processing1mPid; indexObjectToSolr(script3, provAlaWaiNS02MatlabProcessingScheduleAW02XX_001CTDXXXXR00Processing1m); // Insert the EML file into the task queue String emlDoc = provAlaWaiNS02MatlabProcessingEML1xmlPid; indexObjectToSolr(emlDoc, provAlaWaiNS02MatlabProcessingEML1xml); // Insert the output image into the task queue String jpgImage = provAlaWaiNS02ImageDataAW02XX_001CTDXXXXR00_20150203_10day1jpgPid; indexObjectToSolr(jpgImage, provAlaWaiNS02ImageDataAW02XX_001CTDXXXXR00_20150203_10day1jpg); // Insert the CTD data into the task queue String ctdData = provAlaWaiNS02CTDData1txtPid; indexObjectToSolr(ctdData, provAlaWaiNS02CTDData1txt); Thread.sleep(SLEEPTIME); // now process the tasks //processor.processIndexTaskQueue(); // Insert the resource map into the task queue String resourceMap = provAlaWaiNS02MatlabProcessing2RDFPid; //formatId = "http://www.openarchives.org/ore/terms"; indexObjectToSolr(resourceMap, provAlaWaiNS02MatlabProcessing2RDF); Thread.sleep(SLEEPTIME); // now process the tasks //processor.processIndexTaskQueue(); Thread.sleep(SLEEPTIME); Thread.sleep(SLEEPTIME); Thread.sleep(SLEEPTIME); // ensure everything indexed properly assertPresentInSolrIndex(script1); assertPresentInSolrIndex(script2); assertPresentInSolrIndex(script3); assertPresentInSolrIndex(emlDoc); assertPresentInSolrIndex(jpgImage); assertPresentInSolrIndex(ctdData); assertPresentInSolrIndex(resourceMap); } /** * Test the end to end index processing of a resource map with provenance statements * * @throws Exception */ //@Ignore @Test public void testInsertPartsResourceMap() throws Exception { // Insert the EML file into the task queue String emlId = partEml220Pid; //formatId = "https://eml.ecoinformatics.org/eml-2.2.0"; indexObjectToSolr(emlId, partEml220); // Insert the portal document into the task queue String portalId = partPortalPid; //formatId = "https://purl.dataone.org/portals-1.0.0"; String serieId = "urn:uuid:27ae3627-be62-4963-859a-8c96d940cadc"; indexObjectToSolr(portalId, partPortal); Thread.sleep(SLEEPTIME); // now process the tasks //processor.processIndexTaskQueue(); // Insert the resource map into the task queue String resourceMapId = "resource_map_urn:uuid:cd489c7e-be88-4d57-b13a-911b25a0b47f"; //formatId = "http://www.openarchives.org/ore/terms"; //The following statement was commented before we created the dataone-indexer project //insertResource(resourceMapId, formatId, partResourcemap, nodeid, userDN); Thread.sleep(SLEEPTIME); Thread.sleep(SLEEPTIME); assertPresentInSolrIndex(emlId); assertPresentInSolrIndex(portalId); //assertPresentInSolrIndex(resourceMapId); assertTrue(compareFieldValue(emlId, "title", "EML Annotation Example")); } /** * Default test - is JUnit working as expected? */ @Test public void testInit() { Assert.assertTrue(1 == 1); } /* Load the indexer and provenance context beans */ protected void configureSpringResources() throws IOException { // Instantiate the generator and processor from the test-context beans //processor = (IndexTaskProcessor) context.getBean("indexTaskProcessor"); //generator = (IndexTaskGenerator) context.getBean("indexTaskGenerator"); // instantiate the RDF resource to be tested if (provenanceContext == null) { provenanceContext = new ClassPathXmlApplicationContext( "org/dataone/cn/indexer/resourcemap/test-context-provenance.xml"); } provAlaWaiNS02MatlabProcessing2RDF = (Resource) provenanceContext .getBean("provAlaWaiNS02MatlabProcessing2RDF"); provAlaWaiNS02MatlabProcessingDataProcessor1m = (Resource) provenanceContext .getBean("provAlaWaiNS02MatlabProcessingDataProcessor1m"); provAlaWaiNS02MatlabProcessingConfigure1m = (Resource) provenanceContext .getBean("provAlaWaiNS02MatlabProcessingConfigure1m"); provAlaWaiNS02MatlabProcessingScheduleAW02XX_001CTDXXXXR00Processing1m = (Resource) provenanceContext .getBean("provAlaWaiNS02MatlabProcessingScheduleAW02XX_001CTDXXXXR00Processing1m"); provAlaWaiNS02MatlabProcessingEML1xml = (Resource) provenanceContext .getBean("provAlaWaiNS02MatlabProcessingEML1xml"); provAlaWaiNS02CTDData1txt = (Resource) provenanceContext .getBean("provAlaWaiNS02CTDData1txt"); provAlaWaiNS02ImageDataAW02XX_001CTDXXXXR00_20150203_10day1jpg = (Resource) provenanceContext .getBean("provAlaWaiNS02ImageDataAW02XX_001CTDXXXXR00_20150203_10day1jpg"); partEml220 = (Resource) provenanceContext.getBean("eml220TestDocSciMeta"); partPortal = (Resource) provenanceContext.getBean("portalTestDoc"); partResourcemap = (Resource) provenanceContext.getBean("partResourceMap"); } }