#!/bin/bash
# compare_dirs.sh
#
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <t>] <dir1> <dir2> <words_file>
#
# This script:
#  - Scans immediate subdirectories in <dir1> and <dir2>
#  - Normalizes and "cleans" their names by removing punctuation, converting to
#    lower-case, trimming whitespace, and removing any words listed in <words_file>
#  - Groups directories using a fuzzy matching algorithm (configurable threshold)
#  - Automatically removes a directory from a duplicate group if its original name
#    contains an undesirable word while an alternative does not
#  - Then automatically removes any remaining duplicates in each group (keeping
#    the first directory)
#  - Supports a --dry-run mode that shows actions without deleting directories
#
# Requires: bash 4.4+ (mapfile, associative arrays, empty-array expansion
# under set -u), python3 (fuzzy similarity via difflib), awk.
set -euo pipefail

# Default options
DRY_RUN=false
SIMILARITY_THRESHOLD=0.8

# Process command-line flags.
# "${1:-}" (not "$1"): with no arguments at all, a bare "$1" would abort here
# with an "unbound variable" error under set -u instead of reaching the usage
# message below.
while [[ "${1:-}" == --* ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --threshold)
      # ${2:?...} aborts with a clear message if --threshold has no value,
      # instead of letting 'shift 2' fail obscurely.
      SIMILARITY_THRESHOLD="${2:?--threshold requires a numeric value}"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

if [ "$#" -ne 3 ]; then
  echo "Usage: $0 [--dry-run] [--threshold <t>] <dir1> <dir2> <words_file>" >&2
  exit 1
fi

DIR1="$1"
DIR2="$2"
WORDS_FILE="$3"

# Verify input paths; diagnostics go to stderr.
if [ ! -d "$DIR1" ]; then
  echo "Error: Directory '$DIR1' not found." >&2
  exit 1
fi
if [ ! -d "$DIR2" ]; then
  echo "Error: Directory '$DIR2' not found." >&2
  exit 1
fi
if [ ! -f "$WORDS_FILE" ]; then
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi

# Read undesirable words (one per line) into an array, filtering out blank lines.
mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")

#######################################
# Normalize and clean a directory name.
# Lower-cases, strips punctuation, trims whitespace, then removes every
# undesirable word (as a case-insensitive substring, matching the original
# behavior).
# Globals:   words (read)
# Arguments: $1 - raw directory basename
# Outputs:   cleaned name on stdout
#######################################
clean_name() {
  local name="$1"
  local normalized
  normalized=$(printf '%s\n' "$name" | tr '[:upper:]' '[:lower:]' \
    | sed 's/[^a-z0-9 ]//g' | xargs)
  local cleaned="$normalized"
  local word lw
  for word in "${words[@]}"; do
    # Bash parameter expansion treats the word literally — unlike the previous
    # sed-based removal, a word containing '/', '.', '*' etc. cannot corrupt
    # the pattern. 'cleaned' is already lower-case, so lower-casing the word
    # gives case-insensitive removal.
    lw=$(printf '%s\n' "$word" | tr '[:upper:]' '[:lower:]')
    cleaned=${cleaned//"$lw"/}
  done
  # Trim again in case extra spaces were left.
  cleaned=$(printf '%s\n' "$cleaned" | xargs)
  printf '%s\n' "$cleaned"
}

#######################################
# Compute fuzzy similarity between two names using Python's difflib.
# Arguments: $1, $2 - names to compare
# Outputs:   ratio in [0,1] on stdout
#######################################
compute_similarity() {
  local name1="$1"
  local name2="$2"
  python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
}

#######################################
# Float comparison helper: succeeds (exit 0) iff $1 >= $2.
# Replaces the previous 'bc -l' dependency with POSIX awk.
#######################################
at_least() {
  awk -v s="$1" -v t="$2" 'BEGIN { exit (s + 0 >= t + 0) ? 0 : 1 }'
}

# Initialize grouping structures explicitly.
declare -a group_rep=()  # Representative cleaned name per group index.
declare -A groups=()     # groups[i] holds newline-separated directory paths.

# Process immediate subdirectories from both DIR1 and DIR2.
for d in "$DIR1"/* "$DIR2"/*; do
  if [ -d "$d" ]; then
    base=$(basename "$d")
    cleaned=$(clean_name "$base")
    added=false
    # Compare against each existing group's representative.
    for i in "${!group_rep[@]}"; do
      rep="${group_rep[$i]}"
      sim=$(compute_similarity "$rep" "$cleaned")
      if at_least "$sim" "$SIMILARITY_THRESHOLD"; then
        groups["$i"]+=$'\n'"$d"
        added=true
        break
      fi
    done
    # If no similar group was found, start a new group.
    if [ "$added" = false ]; then
      new_index=${#group_rep[@]}
      group_rep+=("$cleaned")
      groups["$new_index"]="$d"
    fi
  fi
done

echo "=== Automatic Removal Based on Undesirable Words ==="
# For each duplicate group, automatically remove directories whose original
# names contain an undesirable word if at least one alternative in the group
# does not. NOTE: 'read -a' only consumes the first line of a multi-line
# string, which silently hid every multi-member group; mapfile reads them all.
for key in "${!groups[@]}"; do
  mapfile -t paths <<< "${groups[$key]}"
  if [ "${#paths[@]}" -gt 1 ]; then
    for path in "${paths[@]}"; do
      base=$(basename "$path")
      for word in "${words[@]}"; do
        # -F: match the word literally, not as a regex; -- guards words that
        # begin with a dash.
        if printf '%s\n' "$base" | grep -qiF -- "$word"; then
          removal_candidate=false
          for other in "${paths[@]}"; do
            if [ "$other" != "$path" ]; then
              other_base=$(basename "$other")
              if ! printf '%s\n' "$other_base" | grep -qiF -- "$word"; then
                removal_candidate=true
                break
              fi
            fi
          done
          if $removal_candidate; then
            echo "Candidate for auto-removal: $path (matches word: '$word')"
            if $DRY_RUN; then
              echo "Dry-run: would remove $path"
            else
              rm -rf -- "$path"
              echo "Removed $path"
            fi
            # Update the group by removing the candidate.
            new_group=()
            for p in "${paths[@]}"; do
              if [ "$p" != "$path" ]; then
                new_group+=("$p")
              fi
            done
            # Guard the empty case: printf over an empty array would emit a
            # spurious blank line into the group.
            if [ "${#new_group[@]}" -gt 0 ]; then
              groups["$key"]=$(printf '%s\n' "${new_group[@]}")
            else
              groups["$key"]=""
            fi
            # Refresh the paths array.
            mapfile -t paths <<< "${groups[$key]}"
            break 2
          fi
        fi
      done
    done
  fi
done

echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="
# For any remaining duplicate groups, automatically remove all but the first
# directory.
for key in "${!groups[@]}"; do
  mapfile -t paths <<< "${groups[$key]}"
  # Filter out directories that no longer exist (also drops empty entries).
  existing=()
  for path in "${paths[@]}"; do
    if [ -d "$path" ]; then
      existing+=("$path")
    fi
  done
  if [ "${#existing[@]}" -gt 1 ]; then
    echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):"
    echo "Keeping: ${existing[0]}"
    for (( i=1; i<${#existing[@]}; i++ )); do
      echo "Auto-removing: ${existing[$i]}"
      if $DRY_RUN; then
        echo "Dry-run: would remove ${existing[$i]}"
      else
        rm -rf -- "${existing[$i]}"
        echo "Removed ${existing[$i]}"
      fi
    done
  fi
done

echo "Script completed."