From 65bb83fd7739d927ed970c4b7555dc01b9857c69 Mon Sep 17 00:00:00 2001 From: Anusha Ranganathan <anusha@cottagelabs.com> Date: Thu, 5 Dec 2024 19:04:39 +0100 Subject: [PATCH] Verify CRC1280 import task --- .../services/prepare_csv_from_crc_folder.rb | 3 +- .../app/services/verify_crc_dataset_import.rb | 224 ++++++++++++++++++ hyrax/lib/tasks/verify_crc_1280_import.rake | 20 ++ 3 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 hyrax/app/services/verify_crc_dataset_import.rb create mode 100644 hyrax/lib/tasks/verify_crc_1280_import.rake diff --git a/hyrax/app/services/prepare_csv_from_crc_folder.rb b/hyrax/app/services/prepare_csv_from_crc_folder.rb index a39d1a47..add3540b 100644 --- a/hyrax/app/services/prepare_csv_from_crc_folder.rb +++ b/hyrax/app/services/prepare_csv_from_crc_folder.rb @@ -321,6 +321,7 @@ class PrepareCsvFromCrcFolder # fix incorrect spelling in import data file_attributes['Modality'] = correct_spelling_of_modality(file_attributes['Modality']) + experiment_title = [file_attributes['Experiment title'], file_attributes['DublinCore-Title'], file_attributes['DataCite-Title']].find(&:present?) experiment_description = [file_attributes['Experiment Description'], file_attributes['DublinCore-Description'], file_attributes['DataCite-Description']].find(&:present?) 
[ @@ -332,7 +333,7 @@ class PrepareCsvFromCrcFolder [meta_info[:parent_work_source_identifier], meta_info[:parent_group_source_identifier]].compact.join(';'), '', '', - file_attributes['Experiment title'], + experiment_title, file_attributes['Contributor'], file_attributes['DataCite-contributorNameIdentifier'], file_attributes['DataCite-contributorType'], diff --git a/hyrax/app/services/verify_crc_dataset_import.rb b/hyrax/app/services/verify_crc_dataset_import.rb new file mode 100644 index 00000000..5c15153d --- /dev/null +++ b/hyrax/app/services/verify_crc_dataset_import.rb @@ -0,0 +1,224 @@ +require 'json' +require 'csv' + +class VerifyCRCDatasetImport + def self.verify_import(import_id) + importer = ::Bulkrax::Importer.find(import_id) + work_entries = importer.entries.select {|e| e.class.name == "Bulkrax::CrcFolderEntry"} + import_status = true + report_paths = [] + work_entries.each do |work_entry| + verification_status, file_path = self.verify_experiment_and_report(import_id, work_entry.id) + import_status = import_status && verification_status + report_paths.append(file_path) + end + return import_status, report_paths + end + + def self.verify_experiment_and_report(import_id, entry_id) + verification_status, message, paths_compared, analysis = self.verify_experiment(entry_id, import_id) + + # prepare csv report + report_name = "analysis_report_#{import_id}_#{entry_id}_#{Time.now.strftime("%m-%d_%H-%M-%S")}.csv" + report_path = File.join(ENV.fetch('DOWNLOAD_PATH', '/shared/downloads'), report_name) + headers = %w(id status message) + + CSV.open(report_path, "w") do |csv| + # Write headers + csv << headers + + # Write verification status + csv << [JSON.pretty_generate(paths_compared), verification_status, message] + + # Write per file status + if analysis + # only in input + analysis.fetch(:only_in_input, []).each do |i| + csv << [i, false, "Only in input"] + end + + # Only in work + analysis.fetch(:only_in_work, []).each do |i| + csv << [i, false, "Only in 
work"] + end + + # in_both - size_analysis + analysis[:in_both_size_analysis].each do |fn_hash| + csv << [fn_hash[:key], fn_hash[:result], fn_hash[:message]] + end + end + end + return verification_status, report_path + end + + def self.verify_experiment(entry_id, import_id) + # gather paths compared + paths_compared = { + entry_id: entry_id, + import_id: import_id + } + + # Find importer entry + begin + entry = ::Bulkrax::CrcFolderEntry.find(entry_id) + rescue + msg = "Error: Import entry #{entry_id} not found" + return false, msg, paths_compared, nil + end + paths_compared[:source_identifier] = entry.identifier + + # Check import id + if import_id.to_i != entry.importerexporter_id + msg = "Error: Importer id #{import_id} does not match id associated with entry #{entry_id}" + return false, msg, paths_compared, nil + end + + # Get importer + begin + importer = ::Bulkrax::Importer.find(import_id) + rescue + msg = "Error: Importer #{import_id} associated with entry #{entry_id} not found" + return false, msg, paths_compared, nil + end + + # Get base import dir from importer + import_base_dir = importer.parser_fields['import_file_path'] + paths_compared[:import_base_dir] = import_base_dir + + # Get import path + import_folder_name = entry.raw_metadata['folder_name'] + # NOTE: This has the risk of getting the path wrong if the glob finds multiple directories + wildcard_import_path = File.join(import_base_dir, '*', import_folder_name) + paths_found = Dir.glob(wildcard_import_path) + if paths_found.empty? 
+ msg = "The import directory #{wildcard_import_path} was not found" + return false, msg, paths_compared, nil + elsif paths_found.size > 1 + msg = "The import directory #{wildcard_import_path} matched multiple directories" + return false, msg, paths_compared, nil + end + import_path = paths_found[0] + import_path = import_path + '/' unless import_path.end_with?('/') + paths_compared[:import_path] = import_path + + # Check import path exists + unless Dir.exist?(import_path) + msg = "The import directory #{import_path} was not found" + return false, msg, paths_compared, nil + end + + # Check import path is not empty + if Dir.empty?(import_path) + msg = "The import directory #{import_path} is empty" + return false, msg, paths_compared, nil + end + + # Get CrcDataset for the entry + works = CrcDataset.where(source: entry.identifier) + unless works.first.present? + msg = "The CrcDataset with source identifier #{entry.identifier} was not found" + return false, msg, paths_compared, nil + end + paths_compared[:work_id] = works.first.id + + # Get files to compare + + # -- Get list of files from import directory + input_list = Dir.glob(File.join(import_path, '**', '*')). + reject {|fn| File.directory?(fn) }. + reject{|fn| self.restricted_file_names?(fn.split('/')[-1])}. 
+ map{ |fn| fn.sub(import_path, '')}.sort + # ---- downcase folder name + sanitised_input_list = {} + input_list.each do |fp| + sanitised_input_list[fp] = fp + all_parts = fp.split('/') + if all_parts.size > 1 + parts = all_parts[0...-1] + parts.map {|p| p.downcase!} + new_path = File.join(parts.join("/"), all_parts[-1]) + sanitised_input_list[fp] = new_path + end + end + # ---- insert session folder + max_depth = 0 + input_list.each do |fp| + parts = fp.split('/') + max_depth = parts.size if parts.size > max_depth + end + if max_depth == 3 + sanitised_input_list.each do |fp, new_path| + parts = new_path.split('/') + if parts.size == 3 + sanitised_input_list[fp] = File.join(parts[0], 'ses 01', parts[1], parts[2]) + end + end + end + + # -- Get list of objects from S3 for the CrcDataset + s3 = S3StorageService.new() + s3.init_client + bucket_id = s3.sanitise_name(works.first.id) + list_of_work_objects, _work_total_size = s3.list_all_objects(bucket_id) + list_of_works = list_of_work_objects.map{|a| a[:key]}. 
+ reject{|fn| self.restricted_file_names?(fn.split('/')[-1])}.sort + + # Analyse files + in_both = sanitised_input_list.values & list_of_works + only_in_input = sanitised_input_list.values - list_of_works + only_in_work = list_of_works - sanitised_input_list.values + + # Analyse file size + size_analysis = [] + sizes_match = true + in_both.each do |fn| + original_fn = sanitised_input_list.key(fn) + fp = File.join(import_path, original_fn) + input_size = File.size(fp) + selected_work = list_of_work_objects.find{|a| a[:key] == fn} + s3_size = selected_work.fetch(:size, 0) + if input_size == s3_size + message = "Size is the same - #{input_size}" + elsif input_size > s3_size + sizes_match = false + message = "Input file size #{input_size} is greater than object in S3 #{s3_size}" + else + sizes_match = false + message = "Input file size #{input_size} is smaller than object in S3 #{s3_size}" + end + size_analysis.append({ + key: fn, + input_size: input_size, + s3_size: s3_size, + result: input_size == s3_size, + message: message + }) + end + + # Get verification status + verification_status = "Unknown" + if only_in_input.empty? and only_in_work.empty? and sizes_match + verification_status = true + end + if only_in_input.present? or only_in_work.present? 
or !sizes_match + verification_status = false + end + + # gather the analysis + analysis = { + only_in_input: only_in_input, + only_in_work: only_in_work, + in_both_size_analysis: size_analysis + } + + # return + return verification_status, "Comparison completed", paths_compared, analysis + end + + def self.restricted_file_names?(file_name) + parsed_file = YAML.load_file(Rails.root.join('config', 'restricted_files.yml')) + parsed_file['restricted_file_names'].append('system_metadata.json') + parsed_file['restricted_file_names'].map(&:downcase).include?(file_name.downcase) + end + +end diff --git a/hyrax/lib/tasks/verify_crc_1280_import.rake b/hyrax/lib/tasks/verify_crc_1280_import.rake new file mode 100644 index 00000000..3297ccb5 --- /dev/null +++ b/hyrax/lib/tasks/verify_crc_1280_import.rake @@ -0,0 +1,20 @@ +require 'json' + +namespace :rdms do + namespace :crc_1280_import do + desc 'Verify CRC1280 import or verify CRC1280 import of an experiment. This will compare the files in the file path with the objects in the S3 bucket' + task :"verify_import", [:import_id] => :environment do |task, args| + puts "Verifying import #{args.import_id}" + import_status, report_paths = VerifyCRCDatasetImport.verify_import(args.import_id) + puts "import status : #{import_status}" + puts "Detailed reports are available at:" + puts JSON.pretty_generate(report_paths) + end + task :"verify_experiment", [:import_id, :entry_id] => :environment do |task, args| + puts "Verifying import #{args.import_id}" + experiment_status, report_path = VerifyCRCDatasetImport.verify_experiment_and_report(args.import_id, args.entry_id) + puts "Experiment import status : #{experiment_status}" + puts "Detailed report is available at: #{report_path}" + end + end +end -- GitLab