/** * '$RCSfile$' * Copyright: 2000-2019 Regents of the University of California and the * National Center for Ecological Analysis and Synthesis * * '$Author: $' * '$Date: $' * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package edu.ucsb.nceas.metacat.doi.datacite; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.xpath.XPathExpressionException; import org.apache.log4j.Logger; import org.apache.wicket.protocol.http.mock.MockHttpServletRequest; import org.dataone.service.exceptions.InvalidRequest; import org.dataone.service.exceptions.InvalidToken; import org.dataone.service.exceptions.NotAuthorized; import org.dataone.service.exceptions.NotFound; import org.dataone.service.exceptions.NotImplemented; import org.dataone.service.exceptions.ServiceFailure; import org.dataone.service.types.v1.Identifier; import org.dataone.service.types.v1.Session; import org.dataone.service.types.v1.Subject; import org.dataone.service.types.v2.Node; import org.dataone.service.types.v2.SystemMetadata; import org.ecoinformatics.datamanager.parser.DataPackage; import org.ecoinformatics.datamanager.parser.Party; import org.ecoinformatics.datamanager.parser.UserId; import org.ecoinformatics.datamanager.parser.generic.DataPackageParserInterface; import org.ecoinformatics.datamanager.parser.generic.Eml200DataPackageParser; import org.w3c.dom.Document; import edu.ucsb.nceas.ezid.profile.DataCiteProfileResourceTypeValues; import edu.ucsb.nceas.metacat.dataone.MNodeService; /** * A factory to generate data cite meta data for the scientific meta data standards - eml-2.* * @author tao * */ public class EML2DataCiteFactory extends DataCiteMetadataFactory { private static Logger logMetacat = Logger.getLogger(EML2DataCiteFactory.class); /** * Determine if the given name space can be handled by this factory */ @Override public boolean canProcess(String namespace) { boolean can = false; if(namespace != null && namespace.startsWith("eml://ecoinformatics.org/eml-2")) { can = true; } logMetacat.debug("EML2DataCitFactory.canProcess - If this factory can process the xml with the name space " + namespace + "? " + can); return can; } /** * Method to generate the data cite xml document */ @Override public String generateMetadata(Identifier identifier, SystemMetadata sysmeta) throws InvalidRequest, ServiceFailure { if(identifier != null && sysmeta != null) { try { DataPackage emlPackage = getEMLPackage(sysmeta); if (emlPackage != null) { String language = emlPackage.getLanguage(); Document doc = generateROOTDoc(); //identifier String scheme = DOI; String id = removeIdSchemePrefix(identifier.getValue(), scheme); addIdentifier(doc, id, scheme); //creator appendCreators(sysmeta.getRightsHolder(), emlPackage, doc); //title String title = emlPackage.getTitle(); if(title == null || title.trim().equals("")) { throw new InvalidRequest(INVALIDCODE, "The datacite instance must have a title. It can't be null or blank"); } appendTitle(title, doc, language); //publisher String publisher = lookupPublisher(emlPackage); addPublisher(doc, publisher); //publication year String year = lookupPublishingYear(emlPackage, sysmeta); addPublicationYear(doc, year); //subjects (keywords) List subjects = emlPackage.getKeywords(); if(subjects != null) { for(String subject : subjects) { appendSubject(subject, doc, language); } } //language addLanguage(doc, language); //resource type //String resourceType = lookupResourceType(sysmeta); String resourceType = null; //only set the attribute to "dataset" addResourceType(doc, DataCiteProfileResourceTypeValues.DATASET.toString(), resourceType); //version //description (abstract) String description = emlPackage.getAbstract(); if(description != null) { appendDescription(description, doc, language, ABSTRACT); } //size // format String format = lookupFormat(sysmeta); if(format != null) { appendFormat(doc, format); } return serializeDoc(doc); } else { throw new ServiceFailure("1030", "Metacat can't parse the eml object " + identifier.getValue() + " so we can't get the needed information from it."); } } catch (InvalidRequest e) { throw e; } catch (Exception e) { e.printStackTrace(); throw new ServiceFailure("1030", e.getMessage()); } } else { return null; } } /** * Get a parsed eml2 data package if it is an eml 2 document * @param sysMeta * @return null if it is not an eml document. * @throws Exception */ private DataPackage getEMLPackage(SystemMetadata sysMeta) throws Exception{ DataPackage dataPackage = null; if (sysMeta.getFormatId().getValue().startsWith("eml://")) { DataPackageParserInterface parser = new Eml200DataPackageParser(); // for using the MN API as the MN itself MockHttpServletRequest request = new MockHttpServletRequest(null, null, null); Session session = new Session(); Subject subject = MNodeService.getInstance(request).getCapabilities().getSubject(0); session.setSubject(subject); InputStream emlStream = MNodeService.getInstance(request).get(session, sysMeta.getIdentifier()); parser.parse(emlStream); dataPackage = parser.getDataPackage(); } return dataPackage; } /** * Append the creator information to the datacite document * According to https://ezid.cdlib.org/doc/apidoc.html#profile-datacite * Each name may be a corporate, institutional, or personal name. In personal names list family name before given name, as in: * Shakespeare, William * @param subject * @return fullName if found * @throws ServiceFailure * @throws NotAuthorized * @throws NotImplemented * @throws NotFound * @throws InvalidToken * @throws XPathExpressionException */ private void appendCreators(Subject subject, DataPackage emlPackage, Document doc) throws InvalidRequest, ServiceFailure, NotAuthorized, NotImplemented, NotFound, InvalidToken, XPathExpressionException { String nameSep =", "; List parties = emlPackage.getCreators(); if (parties == null || parties.isEmpty()) { throw new InvalidRequest(INVALIDCODE, "The datacite instance must have a creator. It can't be null or blank"); } boolean found = false; for(Party party : parties) { String surName = party.getSurName(); String positionName = party.getPositionName(); String organization = party.getOrganization(); String fullName = null; if(surName != null && !surName.trim().equals("")) { //this is a person List givenNames = party.getGivenNames(); //System.out.println("the surname ============== "+surName); fullName = surName; if(givenNames!=null && givenNames.size() > 0 && givenNames.get(0) != null && !givenNames.get(0).trim().equals("")) { fullName = fullName + nameSep + givenNames.get(0); } } else if(positionName != null && !positionName.trim().equals("")) { //it has a positionName and we will use the positionName as the full name (it doesn't have the individual name) fullName = positionName; } else { //organization name fullName=organization; //organization is the creator. organization = null; //affiliation is null } String nameIdentifier = null; String nameIdentifierSchemeURI = null; String nameIdentifierScheme = null; List userIds = party.getUserIdList(); if(userIds != null && !userIds.isEmpty()) { UserId userId = userIds.get(0);//nameIdentifier only can happen at most once. So only choose the first one. if(userId != null) { String value = userId.getValue(); String directory = userId.getDirectory(); if(directory != null && (directory.startsWith("https://orcid.org") || directory.startsWith("http://orcid.org"))) { nameIdentifierScheme = "ORCID"; if(!directory.endsWith("/")) { directory = directory+"/"; } nameIdentifierSchemeURI = directory; if(value.indexOf(nameIdentifierSchemeURI) > -1) { nameIdentifier = value.replaceFirst(nameIdentifierSchemeURI, "");//get rid of nameIdentifierSchemeURI from the id. } else { nameIdentifier = value; } } else { nameIdentifierScheme = directory; nameIdentifier = value; } } } appendCreator(fullName, doc, organization, nameIdentifier, nameIdentifierSchemeURI, nameIdentifierScheme); found = true; } if(!found) { throw new InvalidRequest(INVALIDCODE, "The datacite instance must have a creator. It can't be null or blank"); } } /** * Format the publishing year. First, it will look the pubDate of the eml package. * If the format of the pubDate is yyyy or yyyy-mm-dd, it will get the yyyy from the pubDate. * If it can't get the value from pubDate, it falls back to get from the upload date from the system metadata. * If it still can't get anything, an exception will be thrown. * @param emlPackage * @param sysMeta * @return the publishing year of this package * @throws InvalidRequest */ String lookupPublishingYear(DataPackage emlPackage, SystemMetadata sysMeta) throws InvalidRequest { String publishYear = null; String pubDate = emlPackage.getPubDate(); if(pubDate != null && !pubDate.trim().equals("")) { String regex = "^\\d{4}"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(pubDate); if (matcher.matches()) { publishYear = pubDate; return publishYear; } else { regex = "^\\d{4}-\\d{2}-\\d{2}"; pattern = Pattern.compile(regex); matcher = pattern.matcher(pubDate); if (matcher.matches()) { publishYear = pubDate.substring(0, 4); return publishYear; } } } if(publishYear == null) { //fall back to use system meta data SimpleDateFormat sdf = new SimpleDateFormat("yyyy"); publishYear = sdf.format(sysMeta.getDateUploaded()); } if(publishYear == null || publishYear.trim().equals("")) { throw new InvalidRequest(INVALIDCODE, "The datacite instance must have the publishing year. Metacat looked the eml object and the system meta data. But they are blank."); } return publishYear; } /** * Format a publisher. First, it looks at the eml package's publisher element. * If it can't find anything, it falls back to look the name of the member node. * If it still can't find anything, an exception will be thrown. * @param emlPackage * @return the publisher of the package * @throws ServiceFailure * @throws InvalidRequest */ String lookupPublisher(DataPackage emlPackage) throws ServiceFailure, InvalidRequest { String publisherStr = ""; Party publisher = emlPackage.getPublisher(); if(publisher != null) { String givenNameDelimiter = ", "; String delimiter = ". "; String surName = publisher.getSurName(); List givenNames = publisher.getGivenNames(); String positionName = publisher.getPositionName(); String organizationName = publisher.getOrganization(); if(surName != null && !surName.trim().equals("")) { publisherStr = publisherStr + surName; } if(givenNames != null && givenNames.get(0) != null && !givenNames.get(0).trim().equals("")) { publisherStr = publisherStr + givenNameDelimiter + givenNames.get(0); } if(positionName != null && !positionName.trim().equals("")) { if(!publisherStr.trim().equals("")) { //already has the individual name, so we need to add the delimiter. publisherStr = publisherStr + delimiter; } publisherStr = publisherStr + positionName; } if(organizationName != null && !organizationName.trim().equals("")) { if(!publisherStr.trim().equals("")) { //already has something, so we need to add the delimiter. publisherStr = publisherStr + delimiter; } publisherStr = publisherStr + organizationName; } } if(publisherStr == null || publisherStr.trim().equals("")) { //fall back to use the name of the member node Node node = MNodeService.getInstance(null).getCapabilities(); publisherStr = node.getName(); } if(publisherStr == null || publisherStr.trim().equals("")) { throw new InvalidRequest(INVALIDCODE, "The datacite instance must have a publisher. Metacat looked the eml object and member node name. But they are blank."); } return publisherStr; } }