#!/bin/bash
# compare_dirs_improved.sh
#
# Usage: ./compare_dirs_improved.sh [--dry-run] [--threshold <value>] [--config <file>] [<dir1> <dir2> <words_file>]
#
# This script:
#   1. Scans immediate subdirectories in <dir1> and <dir2> in parallel.
#   2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
#      the directory is removed outright.
#   3. The remaining directories are "cleaned" (converted to lowercase, punctuation removed)
#      and then grouped by fuzzy similarity using a configurable threshold.
#      The fuzzy similarity process is optimized with a multiprocessing helper.
#   4. Within each group, if one directory's name contains "2160p" and another contains "1080p",
#      the 1080p directory(ies) are removed (or flagged in dry-run mode).
#   5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
#   6. A --dry-run mode is available to preview removals without actually deleting any directories.
#   7. Supports configuration files for persistent settings.
#   8. Provides comprehensive logging of operations.
set -euo pipefail

# Default configuration file location
CONFIG_FILE="./compare_dirs.conf"

#######################################
# Write a timestamped message to the console (colorized) and to the log file.
# Globals:   LOG_ENABLED, LOG_LEVEL, LOG_FILE (read)
# Arguments: $1 - level: DEBUG, INFO, WARNING or ERROR
#            $2 - message text
# Outputs:   colorized line on stdout; plain line appended to $LOG_FILE
# Returns:   0 (also when the message is filtered out)
#######################################
log() {
  local level="$1"
  local message="$2"
  local timestamp color

  # Only log if logging is enabled
  if [[ "$LOG_ENABLED" != "true" ]]; then
    return 0
  fi

  # Log level filtering: drop messages ranked below the configured LOG_LEVEL.
  case "$LOG_LEVEL:$level" in
    INFO:DEBUG | WARNING:DEBUG | WARNING:INFO | ERROR:DEBUG | ERROR:INFO | ERROR:WARNING)
      return 0
      ;;
  esac

  # Assigned separately from `local` so a failing `date` is not masked.
  timestamp=$(date "+%Y-%m-%d %H:%M:%S")

  case "$level" in
    DEBUG) color='\033[36m' ;;   # Cyan
    INFO) color='\033[32m' ;;    # Green
    WARNING) color='\033[33m' ;; # Yellow
    ERROR) color='\033[31m' ;;   # Red
    *) color='' ;;
  esac

  # The message is passed through %s so backslashes in paths are printed
  # literally; only the color codes are escape-expanded (%b).
  if [[ -n "$color" ]]; then
    printf '%b[%s] [%s] %s%b\n' "$color" "$timestamp" "$level" "$message" '\033[0m'
  else
    printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message"
  fi

  # Write to log file (without color codes)
  printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" >>"$LOG_FILE"
}

# Default options
DRY_RUN=false
SIMILARITY_THRESHOLD=0.8
LOG_ENABLED=true
LOG_FILE="./compare_dirs.log"
LOG_LEVEL="INFO"
PARALLEL_PROCESSES=0

#######################################
# Load settings from a configuration file by sourcing it.
# NOTE: the file is executed as shell code; only point this at trusted files.
# Arguments: $1 - path to the configuration file
# Outputs:   a confirmation or a warning on stdout
# Returns:   0 if the file was loaded, 1 if it does not exist
#######################################
load_config() {
  local config_file="$1"

  if [[ ! -f "$config_file" ]]; then
    echo "Warning: Configuration file '$config_file' not found. Using defaults."
    return 1
  fi

  # shellcheck disable=SC1090 -- path is only known at run time
  source "$config_file"
  echo "Configuration loaded from $config_file"
  return 0
}

#######################################
# Classify one directory as KEEP or REMOVE based on the undesirable words.
# NOTE(review): the original worker body was lost from the source; this is
# reconstructed from the "KEEP:<path>" / "REMOVE:<path>" lines the caller
# parses and from the header comment ("name contains any undesirable word").
# The case-insensitive substring match is an assumption - confirm.
# Globals:   words (read) - array of undesirable words
# Arguments: $1 - directory path
# Outputs:   one line on stdout: "REMOVE:<path>" or "KEEP:<path>"
#######################################
_classify_directory() {
  local path="$1"
  local name="${path##*/}"
  local word

  name="${name,,}"
  # ${words[@]+...} keeps `set -u` happy when the array is empty or unset.
  for word in ${words[@]+"${words[@]}"}; do
    # An empty word would match every name; never treat it as a hit.
    [[ -n "$word" ]] || continue
    if [[ "$name" == *"${word,,}"* ]]; then
      printf 'REMOVE:%s\n' "$path"
      return 0
    fi
  done
  printf 'KEEP:%s\n' "$path"
}

#######################################
# Classify every immediate subdirectory of a directory, in parallel batches.
# Globals:   words (read, via _classify_directory)
# Arguments: $1 - directory to scan
#            $2 - max parallel processes (default 4; 0 = number of CPU cores)
# Outputs:   "KEEP:<path>" / "REMOVE:<path>" lines on stdout, in glob order.
#            Log output is sent to stderr because stdout is the data channel.
# Returns:   0 on success, 1 if the directory is missing or $2 is invalid
#######################################
process_directories_parallel() {
  local dir="$1"
  local max_procs="${2:-4}" # Default to 4 processes if not specified
  local temp_dir
  local -a pids=()
  local -a all_dirs=()
  local d i j pid total_dirs

  if [[ ! -d "$dir" ]]; then
    log "ERROR" "Directory '$dir' not found in parallel processing." >&2
    return 1
  fi

  if [[ ! "$max_procs" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Invalid number of parallel processes: '$max_procs'" >&2
    return 1
  fi

  # If PARALLEL_PROCESSES is 0, use available CPU cores
  if ((max_procs == 0)); then
    max_procs=$(nproc 2>/dev/null || echo 4)
  fi

  log "DEBUG" "Processing directory '$dir' with $max_procs parallel processes" >&2

  # Get all directories to process
  for d in "$dir"/*; do
    if [[ -d "$d" ]]; then
      all_dirs+=("$d")
    fi
  done

  total_dirs=${#all_dirs[@]}
  log "DEBUG" "Found $total_dirs directories to process" >&2

  # Created only after validation so an early return cannot leak it.
  temp_dir=$(mktemp -d) || return 1

  # Process in batches based on max_procs
  for ((i = 0; i < total_dirs; i++)); do
    # Each worker writes to a file named after its slot in the current batch.
    _classify_directory "${all_dirs[i]}" >"$temp_dir/${#pids[@]}" &
    pids+=("$!")

    # If we've reached max_procs or this is the last directory, wait for
    # the batch to finish and emit its results in submission order.
    if ((${#pids[@]} == max_procs || i == total_dirs - 1)); then
      for pid in "${pids[@]}"; do
        wait "$pid" || log "WARNING" "Worker $pid exited with an error" >&2
      done
      for ((j = 0; j < ${#pids[@]}; j++)); do
        cat -- "$temp_dir/$j"
      done
      # Reset for next batch
      pids=()
    fi
  done

  # Clean up temporary directory
  rm -rf -- "$temp_dir"
}

# Print the full help text.
usage() {
  cat <<EOF
Usage: $0 [OPTIONS] [<dir1> <dir2> <words_file>]

OPTIONS:
  --dry-run             Preview actions without deleting any directories
  --threshold <value>   Set the fuzzy similarity threshold (default: 0.8)
  --config <file>       Specify a configuration file to use
  --log-file <file>     Specify a log file to use
  --log-level <level>   Set log level: DEBUG, INFO, WARNING, ERROR
  --parallel <n>        Number of parallel processes (0 = auto)
  --help                Display this help message

If no directories and words file are specified, values from config file will be used.
EOF
}

# Process command-line flags.
# Values are collected in cli_overrides and applied only AFTER the config file
# has been sourced, so an explicit flag always wins over the config file.
declare -A cli_overrides=()
config_explicit=false
while [[ $# -gt 0 && "$1" == --* ]]; do
  case "$1" in
    --dry-run)
      cli_overrides[DRY_RUN]=true
      shift
      ;;
    --threshold | --config | --log-file | --log-level | --parallel)
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a value." >&2
        echo "Use --help for usage information." >&2
        exit 1
      fi
      case "$1" in
        --threshold) cli_overrides[SIMILARITY_THRESHOLD]="$2" ;;
        --config)
          CONFIG_FILE="$2"
          config_explicit=true
          ;;
        --log-file) cli_overrides[LOG_FILE]="$2" ;;
        --log-level) cli_overrides[LOG_LEVEL]="$2" ;;
        --parallel) cli_overrides[PARALLEL_PROCESSES]="$2" ;;
      esac
      shift 2
      ;;
    --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      echo "Use --help for usage information." >&2
      exit 1
      ;;
  esac
done

# Load configuration file. A missing default config is silently skipped; a
# missing file the user asked for explicitly produces a warning.
if [[ "$config_explicit" == true ]]; then
  load_config "$CONFIG_FILE" || true
elif [[ -f "$CONFIG_FILE" ]]; then
  load_config "$CONFIG_FILE"
fi

# Command-line flags take precedence over values from the config file.
if ((${#cli_overrides[@]} > 0)); then
  for override_name in "${!cli_overrides[@]}"; do
    printf -v "$override_name" '%s' "${cli_overrides[$override_name]}"
  done
fi

# Initialize logging
if [[ "$LOG_ENABLED" == "true" ]]; then
  touch "$LOG_FILE"
  log "INFO" "Logging initialized"
fi

# Check if arguments were provided to override config values
if [[ $# -eq 3 ]]; then
  DIR1="$1"
  DIR2="$2"
  WORDS_FILE="$3"
  log "INFO" "Using command-line arguments for directories and words file"
elif [[ $# -ne 0 ]]; then
  log "ERROR" "Incorrect number of arguments"
  echo "Usage: $0 [--dry-run] [--threshold <value>] [--config <file>] [<dir1> <dir2> <words_file>]" >&2
  exit 1
fi

# Under `set -u` these must exist even when neither the command line nor the
# config file supplied them; fail with a clear message instead of crashing.
DIR1="${DIR1:-}"
DIR2="${DIR2:-}"
WORDS_FILE="${WORDS_FILE:-}"
if [[ -z "$DIR1" || -z "$DIR2" || -z "$WORDS_FILE" ]]; then
  log "ERROR" "Directories and words file must be given as arguments or set in the config file"
  echo "Usage: $0 [--dry-run] [--threshold <value>] [--config <file>] [<dir1> <dir2> <words_file>]" >&2
  exit 1
fi

# The threshold is later used in a numeric comparison; reject anything that
# is not a plain non-negative decimal number.
if [[ ! "$SIMILARITY_THRESHOLD" =~ ^[0-9]*\.?[0-9]+$ ]]; then
  log "ERROR" "Invalid similarity threshold '$SIMILARITY_THRESHOLD'"
  echo "Error: Invalid similarity threshold '$SIMILARITY_THRESHOLD'." >&2
  exit 1
fi

log "INFO" "Script started with parameters: DIR1=$DIR1, DIR2=$DIR2, WORDS_FILE=$WORDS_FILE"
log "INFO" "Configuration: THRESHOLD=$SIMILARITY_THRESHOLD, DRY_RUN=$DRY_RUN, PARALLEL_PROCESSES=$PARALLEL_PROCESSES"

# Verify input paths
if [[ ! -d "$DIR1" ]]; then
  log "ERROR" "Directory '$DIR1' not found."
  echo "Error: Directory '$DIR1' not found." >&2
  exit 1
fi
if [[ ! -d "$DIR2" ]]; then
  log "ERROR" "Directory '$DIR2' not found."
  echo "Error: Directory '$DIR2' not found." >&2
  exit 1
fi
if [[ ! -f "$WORDS_FILE" ]]; then
  log "ERROR" "Words file '$WORDS_FILE' not found."
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi

# Read undesirable words (one per line) into an array, ignoring blank lines.
mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")
log "INFO" "Loaded ${#words[@]} undesirable words from $WORDS_FILE"

echo "=== Pre-filtering Directories by Undesirable Words ==="
log "INFO" "Starting parallel directory filtering"

# Directories that survive the undesirable-word filter (from both inputs).
filtered_dirs=()

#######################################
# Remove a directory, or only report it when running in dry-run mode.
# Globals:   DRY_RUN (read)
# Arguments: $1 - directory path
#######################################
remove_directory() {
  local path="$1"

  if [[ "$DRY_RUN" == true ]]; then
    echo "Dry-run: would remove '$path'"
    log "INFO" "Dry-run: would remove '$path'"
  else
    # `--` protects against names that start with a dash.
    rm -rf -- "$path"
    log "INFO" "Removed '$path'"
    echo "Removed '$path'"
  fi
}

#######################################
# Run the parallel classifier on one input directory and act on its verdicts:
# KEEP:<path> lines are collected, REMOVE:<path> lines are deleted.
# Globals:   filtered_dirs (appended), PARALLEL_PROCESSES (read)
# Arguments: $1 - directory whose immediate subdirectories are filtered
#######################################
prefilter_directory() {
  local root="$1"
  local line dir

  log "INFO" "Processing directories in $root"
  while IFS= read -r line; do
    case "$line" in
      KEEP:*)
        dir="${line#KEEP:}"
        filtered_dirs+=("$dir")
        log "DEBUG" "Keeping directory: $dir"
        ;;
      REMOVE:*)
        dir="${line#REMOVE:}"
        log "INFO" "Removing directory with undesirable word: $dir"
        remove_directory "$dir"
        ;;
    esac
  done < <(process_directories_parallel "$root" "$PARALLEL_PROCESSES")
}

prefilter_directory "$DIR1"
prefilter_directory "$DIR2"

log "INFO" "Filtered directories remaining: ${#filtered_dirs[@]}"

# Function: Normalize and clean a directory name.
# Lowercases, drops every character except a-z, 0-9 and space, then lets
# xargs trim and collapse the remaining whitespace.
clean_name() {
  local name="$1"
  printf '%s\n' "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs
}

# Function: Compute fuzzy similarities between a target and a list of strings
# using multiprocessing.
# Arguments: $1 - target string; $2.. - strings to compare against
# Outputs:   one similarity ratio (0..1) per line, in argument order
# NOTE(review): the original here-document was lost from the source. This
# body is a reconstruction (difflib.SequenceMatcher ratio, one score per
# line) - confirm the metric and output format against the original.
compute_similarities() {
  local target="$1"
  shift
  # Pass target and the list of representatives as command-line arguments to Python.
  python3 - "$target" "$@" <<'PYEOF'
import multiprocessing
import sys
from difflib import SequenceMatcher
from functools import partial

# Below this many candidates, starting worker processes costs more than it saves.
POOL_MIN_ITEMS = 64


def similarity(target, candidate):
    return SequenceMatcher(None, target, candidate).ratio()


def main():
    target, candidates = sys.argv[1], sys.argv[2:]
    # The script is read from stdin, so worker processes cannot re-import it;
    # only the "fork" start method can see `similarity`. Fall back to a plain
    # loop where fork is unavailable.
    use_pool = (
        len(candidates) >= POOL_MIN_ITEMS
        and "fork" in multiprocessing.get_all_start_methods()
    )
    if use_pool:
        with multiprocessing.get_context("fork").Pool() as pool:
            scores = pool.map(partial(similarity, target), candidates)
    else:
        scores = [similarity(target, c) for c in candidates]
    for score in scores:
        print(f"{score:.6f}")


if __name__ == "__main__":
    main()
PYEOF
}

# Without python3 every similarity list would come back empty and each
# directory would silently end up in its own group.
if ! command -v python3 >/dev/null; then
  log "ERROR" "python3 is required for fuzzy similarity matching"
  echo "Error: python3 is required for fuzzy similarity matching." >&2
  exit 1
fi

log "INFO" "Starting fuzzy similarity grouping"

# group_rep[i] is the cleaned name representing group i;
# groups[i] holds that group's directory paths, newline-separated.
# NOTE(review): the declarations and the head of this loop were lost from the
# source and are reconstructed from the surviving loop tail - confirm.
declare -a group_rep=()
declare -a groups=()

# ${filtered_dirs[@]+...} keeps `set -u` happy when nothing survived filtering.
for d in ${filtered_dirs[@]+"${filtered_dirs[@]}"}; do
  base="${d##*/}"
  cleaned=$(clean_name "$base")
  added=false

  if ((${#group_rep[@]} > 0)); then
    mapfile -t sims < <(compute_similarities "$cleaned" "${group_rep[@]}")
    for i in "${!sims[@]}"; do
      # Float comparison via awk (POSIX) rather than bc, which is often absent.
      if awk -v s="${sims[$i]}" -v t="$SIMILARITY_THRESHOLD" 'BEGIN { exit !(s + 0 >= t + 0) }'; then
        groups["$i"]+=$'\n'"$d"
        log "DEBUG" "Added '$d' to group $i (${group_rep[$i]})"
        added=true
        break
      fi
    done
  fi

  if [[ "$added" == false ]]; then
    new_index=${#group_rep[@]}
    group_rep+=("$cleaned")
    groups["$new_index"]="$d"
    log "DEBUG" "Created new group $new_index with representative '$cleaned'"
  fi
done

log "INFO" "Created ${#group_rep[@]} groups after fuzzy similarity matching"

echo "=== Resolution Preference Filtering ==="
log "INFO" "Starting resolution preference filtering"

# For each group, if one directory contains "2160p" and another contains "1080p",
# remove the 1080p directory(ies).
for key in "${!groups[@]}"; do
  # mapfile reads ALL lines; `read -a` would stop at the first newline and
  # make every group look like it has a single member.
  mapfile -t paths <<<"${groups[$key]}"

  has_2160p=false
  has_1080p=false
  for path in "${paths[@]}"; do
    [[ -n "$path" ]] || continue
    base="${path##*/}"
    base="${base,,}"
    # A name carrying both tags counts as 2160p only, so it can never be the
    # sole reason for deleting itself.
    if [[ "$base" == *2160p* ]]; then
      has_2160p=true
      log "DEBUG" "Found 2160p in group $key: $path"
    elif [[ "$base" == *1080p* ]]; then
      has_1080p=true
      log "DEBUG" "Found 1080p in group $key: $path"
    fi
  done

  if [[ "$has_2160p" == true && "$has_1080p" == true ]]; then
    log "INFO" "Group $key (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories"
    echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
    new_group=()
    for path in "${paths[@]}"; do
      [[ -n "$path" ]] || continue
      base="${path##*/}"
      base="${base,,}"
      if [[ "$base" == *1080p* && "$base" != *2160p* ]]; then
        log "INFO" "Removing '$path' because a 2160p version is present"
        echo "Removing '$path' because a 2160p version is present."
        remove_directory "$path"
      else
        new_group+=("$path")
      fi
    done
    # has_2160p guarantees new_group is non-empty here.
    printf -v joined '%s\n' "${new_group[@]}"
    groups["$key"]="${joined%$'\n'}"
  fi
done

echo "=== Interactive Duplicate Resolution ==="
log "INFO" "Starting interactive duplicate resolution"

# For each group that still contains more than one directory, prompt the user to select one to remove.
#######################################
# Split a newline-separated group value into its member paths and keep only
# those that are still existing directories.
# Globals:   existing (written) - array of surviving directory paths
# Arguments: $1 - newline-separated list of directory paths
#######################################
collect_existing_dirs() {
  local -a members=()
  local member

  existing=()
  # mapfile reads every line; `read -a` would stop at the first newline and
  # return only the first member of the group.
  mapfile -t members <<<"$1"
  for member in "${members[@]}"; do
    if [[ -n "$member" && -d "$member" ]]; then
      existing+=("$member")
    fi
  done
}

for key in "${!groups[@]}"; do
  # Filter out directories that no longer exist.
  collect_existing_dirs "${groups[$key]}"

  if ((${#existing[@]} > 1)); then
    log "INFO" "Prompting user for duplicate group: ${group_rep[$key]:-unknown}"
    echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
    i=1
    for p in "${existing[@]}"; do
      echo " [$i] $p"
      i=$((i + 1))
    done
    printf '%s' "Enter the number of the directory you want to remove (or 0 to skip): "
    # At end of input (e.g. stdin is not a terminal) treat the answer as
    # "skip" instead of letting `set -e` abort the script.
    read -r choice || choice=0

    # 10# forces base 10 so an answer like "08" is not parsed as invalid octal.
    if [[ "$choice" =~ ^[0-9]+$ ]] && ((10#$choice > 0 && 10#$choice <= ${#existing[@]})); then
      dir_to_remove="${existing[$((10#$choice - 1))]}"
      log "INFO" "User selected to remove: $dir_to_remove"
      if [[ "$DRY_RUN" == true ]]; then
        echo "Dry-run: would remove '$dir_to_remove'"
        log "INFO" "Dry-run: would remove '$dir_to_remove'"
      else
        # `--` protects against names that start with a dash.
        rm -rf -- "$dir_to_remove"
        log "INFO" "Removed '$dir_to_remove'"
        echo "Removed '$dir_to_remove'"
      fi
    else
      log "INFO" "User skipped removal for group: ${group_rep[$key]:-unknown}"
      echo "No removal selected for this group."
    fi
  fi
done

log "INFO" "Script completed successfully"
echo "Script completed. See $LOG_FILE for detailed log."