# Copyright: 2006 Regents of the University of California,
#            Santa Barbara Coastal LTER
#            http://sbcdata.lternet.edu/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# 02111-1307  USA

require "net/http"
require "rexml/document"
require "uri"

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
require "eml.rb"

# == What is it
# Each EML document can contain DataTable elements representing (in most cases) plain
# text data. The attributes of this data, such as column names, types, domain info,
# are documented in the eml metadata. DataTable encapsulates data table elements from
# eml documents in the instance variable @metadata.
#
# The purpose of this class is to provide methods to easily access metadata attributes
# pertaining to the data table. It can also be extended for specific
# applications to process the data based on the metadata description.
# == Examples
# === Get the location where this data file is stored
#   eml = metacat.find(:docid => 'somedoc.1.1')
#   data_table = eml.data_tables[0]
#   data_table.location
#   => ecogrid://knb/pisco_cbs.30.3
# === Write the data file to disk
#   # note we are using a block so the whole file isn't loaded to RAM
#   File.open("./tmp/#{data_table.id}", "wb") do |file|
#     data_table.read do |buffer|
#       file.write(buffer)
#     end
#   end
class DataTable
  attr_reader :metadata, :eml

  # data_table_element:: the dataTable element of an EML document. It must
  #                      respond to +elements+ with XPath-style lookups
  #                      (presumably a REXML::Element -- confirm against eml.rb).
  # eml::                the EML document object this table belongs to; it is
  #                      only stored and exposed through the +eml+ reader.
  def initialize(data_table_element, eml)
    @metadata = data_table_element
    @eml = eml
  end

  # Methods for accessing eml metadata
  # ----------------------------------

  # Pulls the docid from the distribution element: the last non-empty
  # "/"-separated segment of +location+ (trailing slashes are ignored).
  # Returns nil when +location+ has no such segment.
  def docid
    @docid ||= location.split("/").reject(&:empty?).last
  end

  # Alias for +docid+.
  def id
    docid
  end

  # Text of the physical/distribution/online/url element (memoized).
  # Raises NoMethodError when the element is missing.
  def location
    @location ||= @metadata.elements["physical/distribution/online/url"].text
  end

  # The <physical> child element of the data table.
  def physical
    @metadata.elements["physical"]
  end

  # Size of the data file as an Integer. The unit attribute is not
  # inspected, so the value is only meaningful when expressed in bytes.
  def size
    physical.elements["size"].text.to_i
  end

  # Name of the first child element of <dataFormat> (e.g. "textFormat").
  def data_format
    physical.elements["dataFormat"].elements[1].name
  end

  # The dataFormat/textFormat element, or nil when the table is not text.
  def text_format
    physical.elements["dataFormat/textFormat"]
  end

  # The <simpleDelimited> element of the text format (nil when absent).
  # Raises RuntimeError when the table is not in textFormat.
  def simple_delimited
    require_text_format.elements["simpleDelimited"]
  end

  # Number of header lines as an Integer.
  # Raises RuntimeError when the table is not in textFormat.
  def num_headers
    require_text_format.elements["numHeaderLines"].text.to_i
  end

  # Text of the <recordDelimiter> element.
  # Raises RuntimeError when the table is not in textFormat.
  def record_delimiter
    require_text_format.elements["recordDelimiter"].text
  end

  # Text of the <fieldDelimiter> element.
  # Raises RuntimeError when the table is not in textFormat or has no
  # <simpleDelimited> element.
  def field_delimiter
    delimited = simple_delimited
    raise "data table is not in simpleDelimited format" unless delimited

    delimited.elements["fieldDelimiter"].text
  end

  # Array of the attributeList/attribute elements, in document order.
  def columns
    @metadata.elements.to_a("attributeList/attribute")
  end

  # Text of the <entityName> element.
  def entity_name
    @metadata.elements["entityName"].text
  end

  # ---------------------------
  # End Metadata Access Methods

  # Reads the dataTable text from the url or docid specified by the
  # physical/distribution/online/url entity, yielding the body to the given
  # block chunk by chunk so the whole file is never held in memory.
  #
  # * A location containing "ecogrid" is fetched from metacat: the docid is
  #   sent as a read action to PATH_TO_METACAT (a constant defined outside
  #   this file -- presumably by eml.rb).
  # * Any other location containing "http" is fetched directly.
  #
  # Raises RuntimeError for any other location, for an error document or an
  # unexpected content type returned by metacat.
  def read
    if location =~ /ecogrid/
      uri = URI.parse(PATH_TO_METACAT)
      # encode_www_form escapes the docid so it cannot corrupt the query
      uri.query = URI.encode_www_form([%w[action read], %w[qformat xml], ["docid", docid]])
      http_get(uri) do |response|
        read_metacat_response(response) { |chunk| yield chunk }
      end
    elsif location =~ /http/
      http_get(URI.parse(location)) do |response|
        response.read_body { |chunk| yield chunk }
      end
    else
      raise "Unknown location for dataTable"
    end
  end

  private

  # Returns the textFormat element or raises when the table has none.
  def require_text_format
    text_format || raise("data table is not in textFormat")
  end

  # Issues a GET for +uri+ and yields the response before its body is read.
  # The block form of Net::HTTP.start closes the connection when done, and
  # TLS is enabled for https URIs.
  def http_get(uri)
    Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == "https") do |http|
      http.request_get(uri.request_uri) { |response| yield response }
    end
  end

  # Streams a text/plain metacat response to the block; any other response
  # is turned into a RuntimeError.
  def read_metacat_response(response)
    case response.content_type
    when "text/xml"
      # metacat reports failures as an XML document with an <error> root
      root = REXML::Document.new(response.read_body).root
      raise root.text.to_s if root && root.name == "error"

      raise "Unrecognized response from metacat at #{PATH_TO_METACAT}"
    when "text/plain"
      response.read_body { |chunk| yield chunk }
    else
      raise "Unrecognized content type \"#{response.content_type}\" " \
            "from metacat at #{PATH_TO_METACAT}"
    end
  end
end