From c79d264328e91367bcb5d077176bd9a395d4ad41 Mon Sep 17 00:00:00 2001
From: Anusha Ranganathan <anusha@cottagelabs.com>
Date: Tue, 3 Dec 2024 12:40:59 +0530
Subject: [PATCH 1/3] Task to verify a Crc1280 import or an experiment in an
 import

---
 .../app/services/verify_crc_dataset_import.rb | 186 ++++++++++++++++++
 hyrax/lib/tasks/verify_import.rb              |  20 ++
 2 files changed, 206 insertions(+)
 create mode 100644 hyrax/app/services/verify_crc_dataset_import.rb
 create mode 100644 hyrax/lib/tasks/verify_import.rb

diff --git a/hyrax/app/services/verify_crc_dataset_import.rb b/hyrax/app/services/verify_crc_dataset_import.rb
new file mode 100644
index 00000000..34a17b23
--- /dev/null
+++ b/hyrax/app/services/verify_crc_dataset_import.rb
@@ -0,0 +1,186 @@
+require 'json'
+require 'csv'
+
+class VerifyCRCDatasetImport
+  def self.verify_import(import_id)
+    importer = ::Bulkrax::Importer.find(import_id)
+    work_entries = importer.entries.select {|e| e.class.name == "Bulkrax::CrcFolderEntry"}
+    import_status = true
+    report_paths = []
+    work_entries.each do |work_entry|
+      verification_status, file_path = self.verify_experiment_and_report(import_id, work_entry.id)
+      import_status = import_status && verification_status
+      report_paths.append(file_path)
+    end
+    return import_status, report_paths
+  end
+
+  def self.verify_experiment_and_report(import_id, entry_id)
+    verification_status, message, paths_compared, analysis = self.verify_experiment(entry_id, import_id)
+
+    # prepare csv report
+    report_name = "analysis_report_#{import_id}_#{entry_id}_#{Time.now.strftime("%m-%d_%H-%M-%S")}.csv"
+    report_path = File.join(ENV.fetch('DOWNLOAD_PATH', '/shared/downloads'), report_name)
+    headers = %w(id status message)
+
+    CSV.open(report_path, "w") do |csv|
+      # Write headers
+      csv << headers
+
+      # Write verification status
+      csv << [JSON.pretty_generate(paths_compared), verification_status, message]
+
+      # Write per file status
+      if analysis
+        # only in input
+        analysis.get(:only_in_input, []).each do |i|
+          csv << [i, false, "Only in input"]
+        end
+
+        # Only in work
+        analysis.get(:only_in_work, []).each do |i|
+          csv << [i, false, "Only in work"]
+        end
+
+        # in_both - size_analysis
+        analysis[:in_both_size_analysis].each do |fn_hash|
+          csv << [fn_hash[:key], fn_hash[:result], fn_hash[:message]]
+        end
+      end
+    end
+    return verification_status, report_path
+  end
+
+  def self.verify_experiment(entry_id, import_id)
+    # gather paths compared
+    paths_compared = {
+      entry_id: entry_id,
+      import_id: import_id
+    }
+
+    # Find importer entry
+    begin
+      entry = ::Bulkrax::CrcFolderEntry.find(entry_id)
+    rescue
+      msg = "Error: Import entry #{entry_id} not found"
+      return false, msg, paths_compared, nil
+    end
+    paths_compared[:source_identifier] = entry.identifier
+
+    # Check import id
+    if import_id.to_s != entry.importerexporter_id.to_s
+      msg = "Error: Importer id #{import_id} does not match id associated with entry #{entry_id}"
+      return false, msg, paths_compared, nil
+    end
+
+    # Get importer
+    begin
+      importer = ::Bulkrax::Importer.find(import_id)
+    rescue
+      msg = "Error: Importer #{import_id} associated with entry #{entry_id} not found"
+      return false, msg, paths_compared, nil
+    end
+
+    # Get base import dir from importer
+    import_base_dir = importer.parser_fields['import_file_path']
+    paths_compared[:import_base_dir] = import_base_dir
+
+    # Get import path
+    import_folder_name = entry.raw_metadata['folder_name']
+    group_id = entry.raw_metadata['group_identifier']
+    import_path = File.join(import_base_dir, group_id, import_folder_name)
+    import_path =  import_path + '/' unless import_path.end_with?('/')
+    paths_compared[:import_path] = import_path
+
+    # Check import path exists
+    unless Dir.exist?(import_path)
+      msg = "The import directory #{import_path} was not found"
+      return false, msg, paths_compared, nil
+    end
+
+    # Check import path is not empty
+    if Dir.empty?(import_path)
+      msg = "The import directory #{import_path} is empty"
+      return false, msg, paths_compared, nil
+    end
+
+    # Get CrcDataset for the entry
+    works = CrcDataset.where(source: entry.identifier)
+    unless works.first.present?
+      msg = "The CrcDataset with source identifier #{entry.identifier} was not found"
+      return false, msg, paths_compared, nil
+    end
+    paths_compared[:work_id] = works.first.id
+
+    # Get files to compare
+    reserved_files = %w(meta.json metadata.json system_metadata.json)
+    # -- Get list of files from import directory
+    input_list = Dir.glob(File.join(import_path, '**', '*')).
+      reject {|fn| File.directory?(fn) }.
+      reject{|fn| reserved_files.include?(fn.split('/')[-1])}.
+      map{ |fn| fn.sub(import_path, '')}.sort
+    # -- Get list of objects from S3 for the CrcDataset
+    s3 = S3StorageService.new()
+    s3.init_client
+    bucket_id = s3.sanitise_name(works.first.id)
+    list_of_work_objects, _work_total_size = s3.list_all_objects(bucket_id)
+    list_of_works =  list_of_work_objects.map{|a| a[:key]}.
+      reject{|fn| reserved_files.include?(fn.split('/')[-1])}.sort
+
+    # Analyse files
+    in_both = input_list & list_of_works
+    only_in_input = input_list - list_of_works
+    only_in_work = list_of_works - input_list
+
+    # Analyse file size
+    size_analysis = []
+    sizes_match = true
+    in_both.each do |fn|
+      fp = File.join(import_path, fn)
+      input_size = File.size(fp)
+      selected_work = list_of_work_objects.find{|a| a[:key] == fn}
+      s3_size = selected_work.get(:size, 0)
+      if input_size == s3_size
+        message = "Size is the same - #{input_size}"
+      elsif input_size > s3_size
+        sizes_match = false
+        message = "Input file size #{input_size} is greater than object in S3 #{s3_size}"
+      else
+        sizes_match = false
+        message = "Input file size #{input_size} is smaller than object in S3 #{s3_size}"
+      end
+      size_analysis.append({
+                           key: fn,
+                           input_size: input_size,
+                           s3_size: s3_size,
+                           result: input_size == s3_size,
+                           message: message
+                         })
+    end
+
+    # Get verification status
+    verification_status = "Unknown"
+    if only_in_input.empty? and only_in_work.empty? and sizes_match
+      verification_status = true
+    end
+    if only_in_input.present? or only_in_work.present? or !sizes_match
+      verification_status = false
+    end
+
+    # gather the analysis
+    analysis = {
+      only_in_input: only_in_input,
+      only_in_work: only_in_work,
+      in_both_size_analysis: size_analysis
+    }
+
+    # return
+    return verification_status, "Comparison completed", paths_compared, analysis
+  end
+
+end
+
+
+
+
+
diff --git a/hyrax/lib/tasks/verify_import.rb b/hyrax/lib/tasks/verify_import.rb
new file mode 100644
index 00000000..c9677812
--- /dev/null
+++ b/hyrax/lib/tasks/verify_import.rb
@@ -0,0 +1,20 @@
+require 'json'
+
+namespace :rdms do
+  namespace :crc1280_import do
+    desc 'Verify CRC1280 import or verify CRC1280 import of an experiment. This will compare the files in the file path with the objects in the S3 bucket'
+    task :"verify_import", [:import_id] => :environment do |task, args|
+      puts "Verifying import #{args.import_id}"
+      import_status, report_paths = VerifyCRCDatasetImport.verify_import(args.import_id)
+      puts "import status : #{import_status}"
+      puts "Detailed reports are available at:"
+      puts JSON.pretty_generate(report_paths)
+    end
+    task :"verify_experiment", [:import_id, :entry_id] => :environment do |task, args|
+      puts "Verifying import #{args.import_id}"
+      experiment_status, report_path = VerifyCRCDatasetImport.verify_experiment_and_report(args.import_id, args.entry_id)
+      puts "Experiment import status : #{experiment_status}"
+      puts "Detailed report is available at: #{report_path}"
+    end
+  end
+end
\ No newline at end of file
-- 
GitLab


From 2478b86173c2ce383dc26a1f91ce7a6a5942f18d Mon Sep 17 00:00:00 2001
From: Anusha Ranganathan <anusha@cottagelabs.com>
Date: Wed, 4 Dec 2024 02:12:08 +0530
Subject: [PATCH 2/3] Directory name lowercase and insert missing session
 folder

---
 .../services/prepare_csv_from_crc_folder.rb   |  3 +-
 .../app/services/verify_crc_dataset_import.rb | 72 ++++++++++++++-----
 2 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/hyrax/app/services/prepare_csv_from_crc_folder.rb b/hyrax/app/services/prepare_csv_from_crc_folder.rb
index a39d1a47..add3540b 100644
--- a/hyrax/app/services/prepare_csv_from_crc_folder.rb
+++ b/hyrax/app/services/prepare_csv_from_crc_folder.rb
@@ -321,6 +321,7 @@ class PrepareCsvFromCrcFolder
     # fix incorrect spelling in import data
     file_attributes['Modality'] = correct_spelling_of_modality(file_attributes['Modality'])
 
+    experiment_title = [file_attributes['Experiment title'], file_attributes['DublinCore-Title'], file_attributes['DataCite-Title']].find(&:present?)
     experiment_description = [file_attributes['Experiment Description'], file_attributes['DublinCore-Description'], file_attributes['DataCite-Description']].find(&:present?)
 
     [
@@ -332,7 +333,7 @@ class PrepareCsvFromCrcFolder
       [meta_info[:parent_work_source_identifier], meta_info[:parent_group_source_identifier]].compact.join(';'),
       '',
       '',
-      file_attributes['Experiment title'],
+      experiment_title,
       file_attributes['Contributor'],
       file_attributes['DataCite-contributorNameIdentifier'],
       file_attributes['DataCite-contributorType'],
diff --git a/hyrax/app/services/verify_crc_dataset_import.rb b/hyrax/app/services/verify_crc_dataset_import.rb
index 34a17b23..5c15153d 100644
--- a/hyrax/app/services/verify_crc_dataset_import.rb
+++ b/hyrax/app/services/verify_crc_dataset_import.rb
@@ -33,12 +33,12 @@ class VerifyCRCDatasetImport
       # Write per file status
       if analysis
         # only in input
-        analysis.get(:only_in_input, []).each do |i|
+        analysis.fetch(:only_in_input, []).each do |i|
           csv << [i, false, "Only in input"]
         end
 
         # Only in work
-        analysis.get(:only_in_work, []).each do |i|
+        analysis.fetch(:only_in_work, []).each do |i|
           csv << [i, false, "Only in work"]
         end
 
@@ -87,8 +87,17 @@ class VerifyCRCDatasetImport
 
     # Get import path
     import_folder_name = entry.raw_metadata['folder_name']
-    group_id = entry.raw_metadata['group_identifier']
-    import_path = File.join(import_base_dir, group_id, import_folder_name)
+    # NOTE: This has the risk of getting the path wrong if the glob finds multiple directories
+    wildcard_import_path = File.join(import_base_dir, '*', import_folder_name)
+    paths_found = Dir.glob(wildcard_import_path)
+    if paths_found.empty?
+      msg = "The import directory #{wildcard_import_path} was not found"
+      return false, msg, paths_compared, nil
+    elsif paths_found.size > 1
+      msg = "The import directory #{wildcard_import_path} matched multiple"
+      return false, msg, paths_compared, nil
+    end
+    import_path = paths_found[0]
     import_path =  import_path + '/' unless import_path.end_with?('/')
     paths_compared[:import_path] = import_path
 
@@ -113,33 +122,61 @@ class VerifyCRCDatasetImport
     paths_compared[:work_id] = works.first.id
 
     # Get files to compare
-    reserved_files = %w(meta.json metadata.json system_metadata.json)
+
     # -- Get list of files from import directory
     input_list = Dir.glob(File.join(import_path, '**', '*')).
       reject {|fn| File.directory?(fn) }.
-      reject{|fn| reserved_files.include?(fn.split('/')[-1])}.
+      reject{|fn| self.restricted_file_names?(fn.split('/')[-1])}.
       map{ |fn| fn.sub(import_path, '')}.sort
+    # ---- downcase folder name
+    sanitised_input_list = {}
+    input_list.each do |fp|
+      sanitised_input_list[fp] = fp
+      all_parts = fp.split('/')
+      if all_parts.size > 1
+        parts = all_parts[0...-1]
+        parts.map {|p| p.downcase!}
+        new_path = File.join(parts.join("/"), all_parts[-1])
+        sanitised_input_list[fp] = new_path
+      end
+    end
+    # ---- insert session folder
+    max_depth = 0
+    input_list.each do |fp|
+      parts = fp.split('/')
+      max_depth = parts.size if parts.size > max_depth
+    end
+    if max_depth == 3
+      sanitised_input_list.each do |fp, new_path|
+        parts = new_path.split('/')
+        if parts.size == 3
+          sanitised_input_list[fp] = File.join(parts[0], 'ses 01', parts[1], parts[2])
+        end
+      end
+    end
+
     # -- Get list of objects from S3 for the CrcDataset
     s3 = S3StorageService.new()
     s3.init_client
     bucket_id = s3.sanitise_name(works.first.id)
     list_of_work_objects, _work_total_size = s3.list_all_objects(bucket_id)
     list_of_works =  list_of_work_objects.map{|a| a[:key]}.
-      reject{|fn| reserved_files.include?(fn.split('/')[-1])}.sort
+      reject{|fn| self.restricted_file_names?(fn.split('/')[-1])}.sort
 
     # Analyse files
-    in_both = input_list & list_of_works
-    only_in_input = input_list - list_of_works
-    only_in_work = list_of_works - input_list
+    in_both = sanitised_input_list.values & list_of_works
+    only_in_input = sanitised_input_list.values - list_of_works
+    only_in_work = list_of_works - sanitised_input_list.values
 
     # Analyse file size
     size_analysis = []
     sizes_match = true
     in_both.each do |fn|
-      fp = File.join(import_path, fn)
+      original_fn = sanitised_input_list.key(fn)
+      fp = File.join(import_path, original_fn)
       input_size = File.size(fp)
       selected_work = list_of_work_objects.find{|a| a[:key] == fn}
-      s3_size = selected_work.get(:size, 0)
+      s3_size = selected_work.fetch(:size, 0)
       if input_size == s3_size
         message = "Size is the same - #{input_size}"
       elsif input_size > s3_size
@@ -178,9 +215,10 @@ class VerifyCRCDatasetImport
     return verification_status, "Comparison completed", paths_compared, analysis
   end
 
-end
-
-
-
-
+  def self.restricted_file_names?(file_name)
+    parsed_file =  YAML.load_file(Rails.root.join('config', 'restricted_files.yml'))
+    parsed_file['restricted_file_names'].append('system_metadata.json')
+    parsed_file['restricted_file_names'].map(&:downcase).include?(file_name.downcase)
+  end
 
+end
-- 
GitLab


From dc76e842b8b8daead827cdbeea18624ff1ec5766 Mon Sep 17 00:00:00 2001
From: Anusha Ranganathan <anusha@cottagelabs.com>
Date: Thu, 5 Dec 2024 22:04:34 +0530
Subject: [PATCH 3/3] Rename file with correct extension

---
 .../tasks/{verify_import.rb => verify_crc_1280_import.rake}   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename hyrax/lib/tasks/{verify_import.rb => verify_crc_1280_import.rake} (96%)

diff --git a/hyrax/lib/tasks/verify_import.rb b/hyrax/lib/tasks/verify_crc_1280_import.rake
similarity index 96%
rename from hyrax/lib/tasks/verify_import.rb
rename to hyrax/lib/tasks/verify_crc_1280_import.rake
index c9677812..3297ccb5 100644
--- a/hyrax/lib/tasks/verify_import.rb
+++ b/hyrax/lib/tasks/verify_crc_1280_import.rake
@@ -1,7 +1,7 @@
 require 'json'
 
 namespace :rdms do
-  namespace :crc1280_import do
+  namespace :crc_1280_import do
     desc 'Verify CRC1280 import or verify CRC1280 import of an experiment. This will compare the files in the file path with the objects in the S3 bucket'
     task :"verify_import", [:import_id] => :environment do |task, args|
       puts "Verifying import #{args.import_id}"
@@ -17,4 +17,4 @@ namespace :rdms do
       puts "Detailed report is available at: #{report_path}"
     end
   end
-end
\ No newline at end of file
+end
-- 
GitLab