From 65bb83fd7739d927ed970c4b7555dc01b9857c69 Mon Sep 17 00:00:00 2001
From: Anusha Ranganathan <anusha@cottagelabs.com>
Date: Thu, 5 Dec 2024 19:04:39 +0100
Subject: [PATCH] Add CRC1280 import verification service and rake tasks

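Add a VerifyCRCDatasetImport service and rake tasks that compare the files
in a CRC1280 import folder against the objects stored in S3 for each
imported CrcDataset, and write a per-experiment CSV report of files that
are missing on either side or differ in size. Also fall back to the
DublinCore or DataCite title in PrepareCsvFromCrcFolder when the
experiment title is missing.

Example invocations (importer and entry ids are illustrative):

    bundle exec rake "rdms:crc_1280_import:verify_import[1]"
    bundle exec rake "rdms:crc_1280_import:verify_experiment[1,42]"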
---
 .../services/prepare_csv_from_crc_folder.rb   |   3 +-
 .../app/services/verify_crc_dataset_import.rb | 224 ++++++++++++++++++
 hyrax/lib/tasks/verify_crc_1280_import.rake   |  20 ++
 3 files changed, 246 insertions(+), 1 deletion(-)
 create mode 100644 hyrax/app/services/verify_crc_dataset_import.rb
 create mode 100644 hyrax/lib/tasks/verify_crc_1280_import.rake

diff --git a/hyrax/app/services/prepare_csv_from_crc_folder.rb b/hyrax/app/services/prepare_csv_from_crc_folder.rb
index a39d1a47..add3540b 100644
--- a/hyrax/app/services/prepare_csv_from_crc_folder.rb
+++ b/hyrax/app/services/prepare_csv_from_crc_folder.rb
@@ -321,6 +321,7 @@ class PrepareCsvFromCrcFolder
     # fix incorrect spelling in import data
     file_attributes['Modality'] = correct_spelling_of_modality(file_attributes['Modality'])
 
+    experiment_title = [file_attributes['Experiment title'], file_attributes['DublinCore-Title'], file_attributes['DataCite-Title']].find(&:present?)
     experiment_description = [file_attributes['Experiment Description'], file_attributes['DublinCore-Description'], file_attributes['DataCite-Description']].find(&:present?)
 
     [
@@ -332,7 +333,7 @@ class PrepareCsvFromCrcFolder
       [meta_info[:parent_work_source_identifier], meta_info[:parent_group_source_identifier]].compact.join(';'),
       '',
       '',
-      file_attributes['Experiment title'],
+      experiment_title,
       file_attributes['Contributor'],
       file_attributes['DataCite-contributorNameIdentifier'],
       file_attributes['DataCite-contributorType'],
diff --git a/hyrax/app/services/verify_crc_dataset_import.rb b/hyrax/app/services/verify_crc_dataset_import.rb
new file mode 100644
index 00000000..5c15153d
--- /dev/null
+++ b/hyrax/app/services/verify_crc_dataset_import.rb
@@ -0,0 +1,224 @@
+require 'json'
+require 'csv'
+
+class VerifyCRCDatasetImport
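+  # Verifies every CrcFolderEntry of the given Bulkrax importer and writes a
+  # CSV report for each entry. Returns [overall_status, report_paths].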
+  def self.verify_import(import_id)
+    importer = ::Bulkrax::Importer.find(import_id)
+    work_entries = importer.entries.select {|e| e.class.name == "Bulkrax::CrcFolderEntry"}
+    import_status = true
+    report_paths = []
+    work_entries.each do |work_entry|
+      verification_status, file_path = self.verify_experiment_and_report(import_id, work_entry.id)
+      import_status = import_status && verification_status
+      report_paths.append(file_path)
+    end
+    return import_status, report_paths
+  end
+
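+  # Verifies a single importer entry and writes its CSV report to
+  # DOWNLOAD_PATH (default /shared/downloads).
+  # Returns [verification_status, report_path].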
+  def self.verify_experiment_and_report(import_id, entry_id)
+    verification_status, message, paths_compared, analysis = self.verify_experiment(entry_id, import_id)
+
+    # prepare csv report
+    report_name = "analysis_report_#{import_id}_#{entry_id}_#{Time.now.strftime("%m-%d_%H-%M-%S")}.csv"
+    report_path = File.join(ENV.fetch('DOWNLOAD_PATH', '/shared/downloads'), report_name)
+    headers = %w(id status message)
+
+    CSV.open(report_path, "w") do |csv|
+      # Write headers
+      csv << headers
+
+      # Write verification status
+      csv << [JSON.pretty_generate(paths_compared), verification_status, message]
+
+      # Write per file status
+      if analysis
+        # only in input
+        analysis.fetch(:only_in_input, []).each do |i|
+          csv << [i, false, "Only in input"]
+        end
+
+        # Only in work
+        analysis.fetch(:only_in_work, []).each do |i|
+          csv << [i, false, "Only in work"]
+        end
+
+        # in_both - size_analysis
+        analysis[:in_both_size_analysis].each do |fn_hash|
+          csv << [fn_hash[:key], fn_hash[:result], fn_hash[:message]]
+        end
+      end
+    end
+    return verification_status, report_path
+  end
+
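+  # Compares the files under the entry's import folder with the objects in
+  # the S3 bucket of the corresponding CrcDataset.
+  # Returns [verification_status, message, paths_compared, analysis].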
+  def self.verify_experiment(entry_id, import_id)
+    # gather paths compared
+    paths_compared = {
+      entry_id: entry_id,
+      import_id: import_id
+    }
+
+    # Find importer entry
+    begin
+      entry = ::Bulkrax::CrcFolderEntry.find(entry_id)
+    rescue
+      msg = "Error: Import entry #{entry_id} not found"
+      return false, msg, paths_compared, nil
+    end
+    paths_compared[:source_identifier] = entry.identifier
+
+    # Check import id
+    if import_id.to_i != entry.importerexporter_id
+      msg = "Error: Importer id #{import_id} does not match id associated with entry #{entry_id}"
+      return false, msg, paths_compared, nil
+    end
+
+    # Get importer
+    begin
+      importer = ::Bulkrax::Importer.find(import_id)
+    rescue
+      msg = "Error: Importer #{import_id} associated with entry #{entry_id} not found"
+      return false, msg, paths_compared, nil
+    end
+
+    # Get base import dir from importer
+    import_base_dir = importer.parser_fields['import_file_path']
+    paths_compared[:import_base_dir] = import_base_dir
+
+    # Get import path
+    import_folder_name = entry.raw_metadata['folder_name']
+    # NOTE: This has the risk of getting the path wrong if the glob finds multiple directories
+    wildcard_import_path = File.join(import_base_dir, '*', import_folder_name)
+    paths_found = Dir.glob(wildcard_import_path)
+    if paths_found.empty?
+      msg = "The import directory #{wildcard_import_path} was not found"
+      return false, msg, paths_compared, nil
+    elsif paths_found.size > 1
+      msg = "The import directory #{wildcard_import_path} matched multiple directories"
+      return false, msg, paths_compared, nil
+    end
+    import_path = paths_found[0]
+    import_path += '/' unless import_path.end_with?('/')
+    paths_compared[:import_path] = import_path
+
+    # Check import path exists
+    unless Dir.exist?(import_path)
+      msg = "The import directory #{import_path} was not found"
+      return false, msg, paths_compared, nil
+    end
+
+    # Check import path is not empty
+    if Dir.empty?(import_path)
+      msg = "The import directory #{import_path} is empty"
+      return false, msg, paths_compared, nil
+    end
+
+    # Get CrcDataset for the entry
+    works = CrcDataset.where(source: entry.identifier)
+    unless works.first.present?
+      msg = "The CrcDataset with source identifier #{entry.identifier} was not found"
+      return false, msg, paths_compared, nil
+    end
+    paths_compared[:work_id] = works.first.id
+
+    # Get files to compare
+
+    # -- Get list of files from import directory
+    input_list = Dir.glob(File.join(import_path, '**', '*')).
+      reject { |fn| File.directory?(fn) }.
+      reject { |fn| self.restricted_file_names?(fn.split('/')[-1]) }.
+      map { |fn| fn.sub(import_path, '') }.sort
+    # ---- downcase folder name
+    sanitised_input_list = {}
+    input_list.each do |fp|
+      sanitised_input_list[fp] = fp
+      all_parts = fp.split('/')
+      if all_parts.size > 1
+        parts = all_parts[0...-1].map(&:downcase)
+        new_path = File.join(parts.join("/"), all_parts[-1])
+        sanitised_input_list[fp] = new_path
+      end
+    end
+    # ---- insert session folder
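+    # When the deepest input path has only three components, a default
+    # session segment ('ses 01') is inserted as the second component so the
+    # paths can be compared against the object keys in S3.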
+    max_depth = 0
+    input_list.each do |fp|
+      parts = fp.split('/')
+      max_depth = parts.size if parts.size > max_depth
+    end
+    if max_depth == 3
+      sanitised_input_list.each do |fp, new_path|
+        parts = new_path.split('/')
+        if parts.size == 3
+          sanitised_input_list[fp] = File.join(parts[0], 'ses 01', parts[1], parts[2])
+        end
+      end
+    end
+
+    # -- Get list of objects from S3 for the CrcDataset
+    s3 = S3StorageService.new
+    s3.init_client
+    bucket_id = s3.sanitise_name(works.first.id)
+    list_of_work_objects, _work_total_size = s3.list_all_objects(bucket_id)
+    list_of_works = list_of_work_objects.map { |a| a[:key] }.
+      reject { |fn| self.restricted_file_names?(fn.split('/')[-1]) }.sort
+
+    # Analyse files
+    in_both = sanitised_input_list.values & list_of_works
+    only_in_input = sanitised_input_list.values - list_of_works
+    only_in_work = list_of_works - sanitised_input_list.values
+
+    # Analyse file size
+    size_analysis = []
+    sizes_match = true
+    in_both.each do |fn|
+      original_fn = sanitised_input_list.key(fn)
+      fp = File.join(import_path, original_fn)
+      input_size = File.size(fp)
+      selected_work = list_of_work_objects.find{|a| a[:key] == fn}
+      s3_size = selected_work.fetch(:size, 0)
+      if input_size == s3_size
+        message = "Size is the same - #{input_size}"
+      elsif input_size > s3_size
+        sizes_match = false
+        message = "Input file size #{input_size} is greater than the S3 object size #{s3_size}"
+      else
+        sizes_match = false
+        message = "Input file size #{input_size} is smaller than the S3 object size #{s3_size}"
+      end
+      size_analysis.append({
+        key: fn,
+        input_size: input_size,
+        s3_size: s3_size,
+        result: input_size == s3_size,
+        message: message
+      })
+    end
+
+    # Get verification status
+    verification_status = only_in_input.empty? && only_in_work.empty? && sizes_match
+
+    # gather the analysis
+    analysis = {
+      only_in_input: only_in_input,
+      only_in_work: only_in_work,
+      in_both_size_analysis: size_analysis
+    }
+
+    # return
+    return verification_status, "Comparison completed", paths_compared, analysis
+  end
+
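+  # config/restricted_files.yml is expected to provide a 'restricted_file_names'
+  # list; 'system_metadata.json' is always treated as restricted and matching is
+  # case-insensitive. Illustrative example (the entries are assumptions):
+  #
+  #   restricted_file_names:
+  #     - .DS_Store
+  #     - Thumbs.db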
+  def self.restricted_file_names?(file_name)
+    parsed_file = YAML.load_file(Rails.root.join('config', 'restricted_files.yml'))
+    parsed_file['restricted_file_names'].append('system_metadata.json')
+    parsed_file['restricted_file_names'].map(&:downcase).include?(file_name.downcase)
+  end
+
+end
diff --git a/hyrax/lib/tasks/verify_crc_1280_import.rake b/hyrax/lib/tasks/verify_crc_1280_import.rake
new file mode 100644
index 00000000..3297ccb5
--- /dev/null
+++ b/hyrax/lib/tasks/verify_crc_1280_import.rake
@@ -0,0 +1,20 @@
+require 'json'
+
+namespace :rdms do
+  namespace :crc_1280_import do
+    desc 'Verify a CRC1280 import, or the import of a single experiment. This will compare the files in the import path with the objects in the S3 bucket'
+    task :"verify_import", [:import_id] => :environment do |task, args|
+      puts "Verifying import #{args.import_id}"
+      import_status, report_paths = VerifyCRCDatasetImport.verify_import(args.import_id)
+      puts "Import status: #{import_status}"
+      puts "Detailed reports are available at:"
+      puts JSON.pretty_generate(report_paths)
+    end
+    task :"verify_experiment", [:import_id, :entry_id] => :environment do |task, args|
+      puts "Verifying experiment entry #{args.entry_id} in import #{args.import_id}"
+      experiment_status, report_path = VerifyCRCDatasetImport.verify_experiment_and_report(args.import_id, args.entry_id)
+      puts "Experiment import status: #{experiment_status}"
+      puts "Detailed report is available at: #{report_path}"
+    end
+  end
+end
-- 
GitLab