diff --git a/compare b/compare index a7662c1..44e566c 100755 --- a/compare +++ b/compare @@ -4,14 +4,13 @@ # Usage: ./compare_dirs.sh [--dry-run] [--threshold ] # # This script: -# - Scans immediate subdirectories in and -# - Normalizes and “cleans” their names by removing punctuation, converting to lower-case, -# trimming whitespace, and removing any words listed in -# - Groups directories using a fuzzy matching algorithm (with a configurable threshold) -# - Automatically removes a directory from a duplicate group if its original name contains an -# undesirable word while an alternative does not -# - Then automatically removes any remaining duplicates in each group (keeping the first directory) -# - Supports a --dry-run mode that shows actions without deleting directories +# 1. Scans immediate subdirectories in and . +# 2. For each directory, if its name contains any undesirable word (one per line in ), +# the directory is removed outright. +# 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed) +# and then grouped by fuzzy similarity using a configurable threshold. +# 4. If duplicate groups remain, you are prompted to choose which duplicate to remove. +# 5. A dry-run mode (--dry-run) is available to show what would be removed without deleting. set -euo pipefail @@ -60,109 +59,85 @@ if [ ! -f "$WORDS_FILE" ]; then exit 1 fi -# Read undesirable words (one per line) into an array, filtering out blank lines. +# Read undesirable words (one per line) into an array (ignoring blank lines) mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE") +echo "=== Pre-filtering Directories by Undesirable Words ===" +# Create an array to hold directories that do NOT match any undesirable word. +filtered_dirs=() + +# Loop over immediate subdirectories in both directories. +for d in "$DIR1"/* "$DIR2"/*; do + if [ -d "$d" ]; then + base=$(basename "$d") + remove_flag=false + # Check if the directory name contains any undesirable word (case-insensitive). + for word in "${words[@]}"; do + if echo "$base" | grep -qi "$word"; then + remove_flag=true + break + fi + done + if $remove_flag; then + echo "Removing '$d' because it contains an undesirable word." + if $DRY_RUN; then + echo "Dry-run: would remove '$d'" + else + rm -rf "$d" + echo "Removed '$d'" + fi + else + filtered_dirs+=("$d") + fi + fi +done + # Function: Normalize and clean a directory name. clean_name() { local name="$1" - # Normalize: convert to lower-case, remove punctuation, and trim extra whitespace. + # Normalize: convert to lowercase, remove punctuation, and trim extra whitespace. local normalized normalized=$(echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs) - local cleaned="$normalized" - for word in "${words[@]}"; do - # Remove the word (case-insensitive). - cleaned=$(echo "$cleaned" | sed "s/$(echo "$word" | tr '[:upper:]' '[:lower:]')//Ig") - done - # Trim again in case extra spaces were left. - cleaned=$(echo "$cleaned" | xargs) - echo "$cleaned" + echo "$normalized" } -# Function: Compute fuzzy similarity between two names using Python's difflib. +# Function: Compute fuzzy similarity between two strings using Python's difflib. compute_similarity() { local name1="$1" local name2="$2" python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2" } -# Initialize grouping arrays explicitly. +echo "=== Grouping Remaining Directories by Fuzzy Similarity ===" +# Initialize grouping arrays. declare -a group_rep=() # Array for representative cleaned names. declare -A groups=() # Associative array: groups[i] holds newline-separated directory paths. -# Process immediate subdirectories from both DIR1 and DIR2. -for d in "$DIR1"/* "$DIR2"/*; do - if [ -d "$d" ]; then - base=$(basename "$d") - cleaned=$(clean_name "$base") - added=false - # Compare against each existing group's representative. - for i in "${!group_rep[@]}"; do - rep="${group_rep[$i]}" - sim=$(compute_similarity "$rep" "$cleaned") - if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then - groups["$i"]+=$'\n'"$d" - added=true - break - fi - done - # If no similar group found, create a new group. - if [ "$added" = false ]; then - new_index=${#group_rep[@]} - group_rep+=("$cleaned") - groups["$new_index"]="$d" +# Group the directories in filtered_dirs. +for d in "${filtered_dirs[@]}"; do + base=$(basename "$d") + cleaned=$(clean_name "$base") + added=false + # Compare with each existing group's representative. + for i in "${!group_rep[@]}"; do + rep="${group_rep[$i]}" + sim=$(compute_similarity "$rep" "$cleaned") + if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then + groups["$i"]+=$'\n'"$d" + added=true + break fi + done + # If not added to an existing group, create a new group. + if [ "$added" = false ]; then + new_index=${#group_rep[@]} + group_rep+=("$cleaned") + groups["$new_index"]="$d" fi done -echo "=== Automatic Removal Based on Undesirable Words ===" -# For each duplicate group, automatically remove directories whose original names contain -# an undesirable word if at least one alternative in the group does not. -for key in "${!groups[@]}"; do - IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true - if [ "${#paths[@]}" -gt 1 ]; then - for path in "${paths[@]}"; do - base=$(basename "$path") - for word in "${words[@]}"; do - if echo "$base" | grep -qi "$word"; then - removal_candidate=false - for other in "${paths[@]}"; do - if [ "$other" != "$path" ]; then - other_base=$(basename "$other") - if ! echo "$other_base" | grep -qi "$word"; then - removal_candidate=true - break - fi - fi - done - if $removal_candidate; then - echo "Candidate for auto-removal: $path (matches word: '$word')" - if $DRY_RUN; then - echo "Dry-run: would remove $path" - else - rm -rf "$path" - echo "Removed $path" - fi - # Update the group by removing the candidate. - new_group=() - for p in "${paths[@]}"; do - if [ "$p" != "$path" ]; then - new_group+=("$p") - fi - done - groups["$key"]=$(printf "%s\n" "${new_group[@]}") - # Refresh the paths array. - IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true - break 2 - fi - fi - done - done - fi -done - -echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ===" -# For any remaining duplicate groups, automatically remove all but the first directory. +echo "=== Interactive Duplicate Resolution ===" +# For each group that has more than one directory, prompt the user to select one to remove. for key in "${!groups[@]}"; do IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true # Filter out directories that no longer exist. @@ -173,18 +148,27 @@ for key in "${!groups[@]}"; do fi done if [ "${#existing[@]}" -gt 1 ]; then - echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):" - echo "Keeping: ${existing[0]}" - for (( i=1; i<${#existing[@]}; i++ )); do - echo "Auto-removing: ${existing[$i]}" - if $DRY_RUN; then - echo "Dry-run: would remove ${existing[$i]}" - else - rm -rf "${existing[$i]}" - echo "Removed ${existing[$i]}" - fi + echo "Duplicate group (cleaned representative: ${group_rep[$key]:-unknown}):" + i=1 + for p in "${existing[@]}"; do + echo " [$i] $p" + ((i++)) done + echo -n "Enter the number of the directory you want to remove (or 0 to skip): " + read -r choice + if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then + dir_to_remove="${existing[$((choice-1))]}" + if $DRY_RUN; then + echo "Dry-run: would remove '$dir_to_remove'" + else + rm -rf "$dir_to_remove" + echo "Removed '$dir_to_remove'" + fi + else + echo "No removal selected for this group." + fi fi done echo "Script completed." + diff --git a/words b/words index 4f15897..3c00b30 100644 --- a/words +++ b/words @@ -1,5 +1,8 @@ dv -1080 +DV finnish +FINNISH norwegian +NORWEGIAN swedish +SWEDISH