word list corrected

2025-02-25 14:35:18 +01:00
parent 430076ad00
commit 1c37c870fe
2 changed files with 87 additions and 100 deletions
--- a/182
+++ b/182
@@ -4,14 +4,13 @@
 # Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
 #
 # This script:
-#   - Scans immediate subdirectories in <dir1> and <dir2>
+#   1. Scans immediate subdirectories in <dir1> and <dir2>.
-#   - Normalizes and “cleans” their names by removing punctuation, converting to lower-case,
+#   2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
-#     trimming whitespace, and removing any words listed in <words_file>
+#      the directory is removed outright.
-#   - Groups directories using a fuzzy matching algorithm (with a configurable threshold)
+#   3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
-#   - Automatically removes a directory from a duplicate group if its original name contains an
+#      and then grouped by fuzzy similarity using a configurable threshold.
-#     undesirable word while an alternative does not
+#   4. If duplicate groups remain, you are prompted to choose which duplicate to remove.
-#   - Then automatically removes any remaining duplicates in each group (keeping the first directory)
+#   5. A dry-run mode (--dry-run) is available to show what would be removed without deleting.
 #   - Supports a --dry-run mode that shows actions without deleting directories
 set -euo pipefail
@@ -60,109 +59,85 @@ if [ ! -f "$WORDS_FILE" ]; then
    exit 1
 fi
-# Read undesirable words (one per line) into an array, filtering out blank lines.
+# Read undesirable words (one per line) into an array (ignoring blank lines)
 mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")
 echo "=== Pre-filtering Directories by Undesirable Words ==="
 # Create an array to hold directories that do NOT match any undesirable word.
 filtered_dirs=()
 # Loop over immediate subdirectories in both directories.
 for d in "$DIR1"/* "$DIR2"/*; do
    if [ -d "$d" ]; then
        base=$(basename "$d")
        remove_flag=false
        # Check if the directory name contains any undesirable word (case-insensitive).
        for word in "${words[@]}"; do
            if echo "$base" | grep -qi "$word"; then
                remove_flag=true
                break
            fi
        done
        if $remove_flag; then
            echo "Removing '$d' because it contains an undesirable word."
            if $DRY_RUN; then
                echo "Dry-run: would remove '$d'"
            else
                rm -rf "$d"
                echo "Removed '$d'"
            fi
        else
            filtered_dirs+=("$d")
        fi
    fi
 done
 # Function: Normalize and clean a directory name.
 clean_name() {
    local name="$1"
-    # Normalize: convert to lower-case, remove punctuation, and trim extra whitespace.
+    # Normalize: convert to lowercase, remove punctuation, and trim extra whitespace.
    local normalized
    normalized=$(echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs)
-    local cleaned="$normalized"
+    echo "$normalized"
    for word in "${words[@]}"; do
        # Remove the word (case-insensitive).
        cleaned=$(echo "$cleaned" | sed "s/$(echo "$word" | tr '[:upper:]' '[:lower:]')//Ig")
    done
    # Trim again in case extra spaces were left.
    cleaned=$(echo "$cleaned" | xargs)
    echo "$cleaned"
 }
-# Function: Compute fuzzy similarity between two names using Python's difflib.
+# Function: Compute fuzzy similarity between two strings using Python's difflib.
 compute_similarity() {
    local name1="$1"
    local name2="$2"
    python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
 }
-# Initialize grouping arrays explicitly.
+echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
 # Initialize grouping arrays.
 declare -a group_rep=()   # Array for representative cleaned names.
 declare -A groups=()      # Associative array: groups[i] holds newline-separated directory paths.
-# Process immediate subdirectories from both DIR1 and DIR2.
+# Group the directories in filtered_dirs.
-for d in "$DIR1"/* "$DIR2"/*; do
+for d in "${filtered_dirs[@]}"; do
-    if [ -d "$d" ]; then
+    base=$(basename "$d")
-        base=$(basename "$d")
+    cleaned=$(clean_name "$base")
-        cleaned=$(clean_name "$base")
+    added=false
-        added=false
+    # Compare with each existing group's representative.
-        # Compare against each existing group's representative.
+    for i in "${!group_rep[@]}"; do
-        for i in "${!group_rep[@]}"; do
+        rep="${group_rep[$i]}"
-            rep="${group_rep[$i]}"
+        sim=$(compute_similarity "$rep" "$cleaned")
-            sim=$(compute_similarity "$rep" "$cleaned")
+        if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
-            if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
+            groups["$i"]+=$'\n'"$d"
-                groups["$i"]+=$'\n'"$d"
+            added=true
-                added=true
+            break
                break
            fi
        done
        # If no similar group found, create a new group.
        if [ "$added" = false ]; then
            new_index=${#group_rep[@]}
            group_rep+=("$cleaned")
            groups["$new_index"]="$d"
        fi
    done
    # If not added to an existing group, create a new group.
    if [ "$added" = false ]; then
        new_index=${#group_rep[@]}
        group_rep+=("$cleaned")
        groups["$new_index"]="$d"
    fi
 done
-echo "=== Automatic Removal Based on Undesirable Words ==="
+echo "=== Interactive Duplicate Resolution ==="
-# For each duplicate group, automatically remove directories whose original names contain
+# For each group that has more than one directory, prompt the user to select one to remove.
 # an undesirable word if at least one alternative in the group does not.
 for key in "${!groups[@]}"; do
    IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
    if [ "${#paths[@]}" -gt 1 ]; then
        for path in "${paths[@]}"; do
            base=$(basename "$path")
            for word in "${words[@]}"; do
                if echo "$base" | grep -qi "$word"; then
                    removal_candidate=false
                    for other in "${paths[@]}"; do
                        if [ "$other" != "$path" ]; then
                            other_base=$(basename "$other")
                            if ! echo "$other_base" | grep -qi "$word"; then
                                removal_candidate=true
                                break
                            fi
                        fi
                    done
                    if $removal_candidate; then
                        echo "Candidate for auto-removal: $path (matches word: '$word')"
                        if $DRY_RUN; then
                            echo "Dry-run: would remove $path"
                        else
                            rm -rf "$path"
                            echo "Removed $path"
                        fi
                        # Update the group by removing the candidate.
                        new_group=()
                        for p in "${paths[@]}"; do
                            if [ "$p" != "$path" ]; then
                                new_group+=("$p")
                            fi
                        done
                        groups["$key"]=$(printf "%s\n" "${new_group[@]}")
                        # Refresh the paths array.
                        IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
                        break 2
                    fi
                fi
            done
        done
    fi
 done
 echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="
 # For any remaining duplicate groups, automatically remove all but the first directory.
 for key in "${!groups[@]}"; do
    IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
    # Filter out directories that no longer exist.
@@ -173,18 +148,27 @@ for key in "${!groups[@]}"; do
        fi
    done
    if [ "${#existing[@]}" -gt 1 ]; then
-        echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):"
+        echo "Duplicate group (cleaned representative: ${group_rep[$key]:-unknown}):"
-        echo "Keeping: ${existing[0]}"
+        i=1
-        for (( i=1; i<${#existing[@]}; i++ )); do
+        for p in "${existing[@]}"; do
-            echo "Auto-removing: ${existing[$i]}"
+            echo "  [$i] $p"
-            if $DRY_RUN; then
+            ((i++))
                echo "Dry-run: would remove ${existing[$i]}"
            else
                rm -rf "${existing[$i]}"
                echo "Removed ${existing[$i]}"
            fi
        done
        echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
        read -r choice
        if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then
            dir_to_remove="${existing[$((choice-1))]}"
            if $DRY_RUN; then
                echo "Dry-run: would remove '$dir_to_remove'"
            else
                rm -rf "$dir_to_remove"
                echo "Removed '$dir_to_remove'"
            fi
        else
            echo "No removal selected for this group."
        fi
    fi
 done
 echo "Script completed."
--- a/5
+++ b/5
@@ -1,5 +1,8 @@
 dv
-1080
+DV
 finnish
 FINNISH
 norwegian
 NORWEGIAN
 swedish
 SWEDISH