word list corrected

2025-02-25 14:35:18 +01:00 · 2025-02-25 14:35:18 +01:00 · 1c37c870fe
commit 1c37c870fe
parent 430076ad00
2 changed files with 87 additions and 100 deletions
--- a/152
+++ b/152
@ -4,14 +4,13 @@
 # Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
 #
 # This script:
-#   - Scans immediate subdirectories in <dir1> and <dir2>
-#   - Normalizes and “cleans” their names by removing punctuation, converting to lower-case,
-#     trimming whitespace, and removing any words listed in <words_file>
-#   - Groups directories using a fuzzy matching algorithm (with a configurable threshold)
-#   - Automatically removes a directory from a duplicate group if its original name contains an
-#     undesirable word while an alternative does not
-#   - Then automatically removes any remaining duplicates in each group (keeping the first directory)
-#   - Supports a --dry-run mode that shows actions without deleting directories
+#   1. Scans immediate subdirectories in <dir1> and <dir2>.
+#   2. For each directory, if its name contains any undesirable word (one per line in <words_file>,
+#      matched case-insensitively), the directory is removed outright, before any grouping is done.
+#   3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
+#      and then grouped by fuzzy similarity using a configurable threshold.
+#   4. If duplicate groups remain, you are prompted to choose which duplicate to remove.
+#   5. A dry-run mode (--dry-run) is available to show what would be removed without deleting.

 set -euo pipefail

@ -60,43 +59,66 @@ if [ ! -f "$WORDS_FILE" ]; then
    exit 1
 fi

-# Read undesirable words (one per line) into an array, filtering out blank lines.
+# Read undesirable words (one per line) into an array (ignoring blank lines)
 mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")

+echo "=== Pre-filtering Directories by Undesirable Words ==="
+# Create an array to hold directories that do NOT match any undesirable word.
+filtered_dirs=()
+
+# Loop over immediate subdirectories in both directories.
+for d in "$DIR1"/* "$DIR2"/*; do
+    # Skip anything that is not a directory (this also covers a glob that
+    # matched nothing and was left as a literal pattern).
+    [ -d "$d" ] || continue
+    base=$(basename -- "$d")
+    remove_flag=false
+    # Check if the directory name contains any undesirable word (case-insensitive).
+    # -F matches the word as a fixed string rather than a regex, so entries like
+    # "h.264" do not over-match and entries like "[dv" do not make grep fail;
+    # "--" stops a word that starts with "-" from being parsed as an option.
+    for word in "${words[@]}"; do
+        if grep -qiF -- "$word" <<< "$base"; then
+            remove_flag=true
+            break
+        fi
+    done
+    if $remove_flag; then
+        echo "Removing '$d' because it contains an undesirable word."
+        if $DRY_RUN; then
+            echo "Dry-run: would remove '$d'"
+        else
+            # "--" protects against paths that begin with "-".
+            rm -rf -- "$d"
+            echo "Removed '$d'"
+        fi
+    else
+        # Only directories that survive the filter take part in grouping.
+        filtered_dirs+=("$d")
+    fi
+done
+
 # Function: Normalize and clean a directory name.
 clean_name() {
    local name="$1"
-    # Normalize: convert to lower-case, remove punctuation, and trim extra whitespace.
+    # Normalize: lowercase, keep only a-z, 0-9 and spaces, then trim/collapse whitespace via xargs.
    local normalized
    normalized=$(echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs)
-    local cleaned="$normalized"
-    for word in "${words[@]}"; do
-        # Remove the word (case-insensitive).
-        cleaned=$(echo "$cleaned" | sed "s/$(echo "$word" | tr '[:upper:]' '[:lower:]')//Ig")
-    done
-    # Trim again in case extra spaces were left.
-    cleaned=$(echo "$cleaned" | xargs)
-    echo "$cleaned"
+    echo "$normalized"
 }

-# Function: Compute fuzzy similarity between two names using Python's difflib.
+# Function: Compute fuzzy similarity between two strings using Python's difflib.
+# Arguments: $1, $2 - the two strings to compare.
+# Outputs:   prints SequenceMatcher(None, $1, $2).ratio() on stdout (invokes python3).
 compute_similarity() {
    local name1="$1"
    local name2="$2"
    python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
 }

-# Initialize grouping arrays explicitly.
+echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
+# Initialize grouping arrays.
 declare -a group_rep=()   # Array for representative cleaned names.
 declare -A groups=()      # Associative array: groups[i] holds newline-separated directory paths.

-# Process immediate subdirectories from both DIR1 and DIR2.
-for d in "$DIR1"/* "$DIR2"/*; do
-    if [ -d "$d" ]; then
+# Group the directories in filtered_dirs.
+for d in "${filtered_dirs[@]}"; do
    base=$(basename "$d")
    cleaned=$(clean_name "$base")
    added=false
-        # Compare against each existing group's representative.
+    # Compare with each existing group's representative.
    for i in "${!group_rep[@]}"; do
        rep="${group_rep[$i]}"
        sim=$(compute_similarity "$rep" "$cleaned")
@ -106,63 +128,16 @@ for d in "$DIR1"/* "$DIR2"/*; do
            break
        fi
    done
-        # If no similar group found, create a new group.
+    # If not added to an existing group, create a new group.
    if [ "$added" = false ]; then
        new_index=${#group_rep[@]}
        group_rep+=("$cleaned")
        groups["$new_index"]="$d"
    fi
-    fi
 done

-echo "=== Automatic Removal Based on Undesirable Words ==="
-# For each duplicate group, automatically remove directories whose original names contain
-# an undesirable word if at least one alternative in the group does not.
-for key in "${!groups[@]}"; do
-    IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
-    if [ "${#paths[@]}" -gt 1 ]; then
-        for path in "${paths[@]}"; do
-            base=$(basename "$path")
-            for word in "${words[@]}"; do
-                if echo "$base" | grep -qi "$word"; then
-                    removal_candidate=false
-                    for other in "${paths[@]}"; do
-                        if [ "$other" != "$path" ]; then
-                            other_base=$(basename "$other")
-                            if ! echo "$other_base" | grep -qi "$word"; then
-                                removal_candidate=true
-                                break
-                            fi
-                        fi
-                    done
-                    if $removal_candidate; then
-                        echo "Candidate for auto-removal: $path (matches word: '$word')"
-                        if $DRY_RUN; then
-                            echo "Dry-run: would remove $path"
-                        else
-                            rm -rf "$path"
-                            echo "Removed $path"
-                        fi
-                        # Update the group by removing the candidate.
-                        new_group=()
-                        for p in "${paths[@]}"; do
-                            if [ "$p" != "$path" ]; then
-                                new_group+=("$p")
-                            fi
-                        done
-                        groups["$key"]=$(printf "%s\n" "${new_group[@]}")
-                        # Refresh the paths array.
-                        IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
-                        break 2
-                    fi
-                fi
-            done
-        done
-    fi
-done
-
-echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="
-# For any remaining duplicate groups, automatically remove all but the first directory.
+echo "=== Interactive Duplicate Resolution ==="
+# For each group that has more than one directory, prompt the user to select one to remove.
 for key in "${!groups[@]}"; do
    IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
    # Filter out directories that no longer exist.
@ -173,18 +148,27 @@ for key in "${!groups[@]}"; do
        fi
    done
    if [ "${#existing[@]}" -gt 1 ]; then
-        echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):"
-        echo "Keeping: ${existing[0]}"
-        for (( i=1; i<${#existing[@]}; i++ )); do
-            echo "Auto-removing: ${existing[$i]}"
-            if $DRY_RUN; then
-                echo "Dry-run: would remove ${existing[$i]}"
-            else
-                rm -rf "${existing[$i]}"
-                echo "Removed ${existing[$i]}"
-            fi
+        echo "Duplicate group (cleaned representative: ${group_rep[$key]:-unknown}):"
+        # Print a 1-based menu of the directories in this group that still exist.
+        i=1
+        for p in "${existing[@]}"; do
+            echo "  [$i] $p"
+            i=$((i + 1))
+        done
+        echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
+        # read returns non-zero at end-of-input (closed or exhausted stdin); under
+        # `set -e` that would abort the script here, so treat it as "skip".
+        read -r choice || choice=0
+        if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then
+            # 10# forces base 10: a digits-only answer with a leading zero (e.g. "08")
+            # would otherwise be parsed as octal and raise an arithmetic error.
+            dir_to_remove="${existing[$((10#$choice - 1))]}"
+            if $DRY_RUN; then
+                echo "Dry-run: would remove '$dir_to_remove'"
+            else
+                # "--" protects against paths that begin with "-".
+                rm -rf -- "$dir_to_remove"
+                echo "Removed '$dir_to_remove'"
+            fi
+        else
+            echo "No removal selected for this group."
+        fi
    fi
 done

 echo "Script completed."
+
--- a/5
+++ b/5
@ -1,5 +1,8 @@
 dv
-1080
+DV
 finnish
+FINNISH
 norwegian
+NORWEGIAN
 swedish
+SWEDISH