word list corrected

This commit is contained in:
masterdraco 2025-02-25 14:35:18 +01:00
parent 430076ad00
commit 1c37c870fe
2 changed files with 87 additions and 100 deletions

182
compare
View File

@ -4,14 +4,13 @@
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file> # Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
# #
# This script: # This script:
# - Scans immediate subdirectories in <dir1> and <dir2> # 1. Scans immediate subdirectories in <dir1> and <dir2>.
# - Normalizes and “cleans” their names by removing punctuation, converting to lower-case, # 2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
# trimming whitespace, and removing any words listed in <words_file> # the directory is removed outright.
# - Groups directories using a fuzzy matching algorithm (with a configurable threshold) # 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
# - Automatically removes a directory from a duplicate group if its original name contains an # and then grouped by fuzzy similarity using a configurable threshold.
# undesirable word while an alternative does not # 4. If duplicate groups remain, you are prompted to choose which duplicate to remove.
# - Then automatically removes any remaining duplicates in each group (keeping the first directory) # 5. A dry-run mode (--dry-run) is available to show what would be removed without deleting.
# - Supports a --dry-run mode that shows actions without deleting directories
set -euo pipefail set -euo pipefail
@ -60,109 +59,85 @@ if [ ! -f "$WORDS_FILE" ]; then
exit 1 exit 1
fi fi
# Read undesirable words (one per line) into an array, filtering out blank lines. # Read undesirable words (one per line) into an array (ignoring blank lines)
mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE") mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")
echo "=== Pre-filtering Directories by Undesirable Words ==="
# Pre-filter pass: walk the immediate children of DIR1 and DIR2 and delete
# (or, in dry-run mode, only report) every directory whose name contains one
# of the entries in the words array. Directories that survive are collected
# in filtered_dirs for the later grouping stage.
#
# NOTE(review): matching is a case-insensitive *substring* test, so a short
# entry such as "dv" also matches names like "Adventure". Confirm that this is
# intended before running without --dry-run.
filtered_dirs=()
for d in "$DIR1"/* "$DIR2"/*; do
  # Skips plain files and the literal pattern left behind by an unmatched glob.
  [ -d "$d" ] || continue
  base=$(basename -- "$d")
  remove_flag=false
  for word in "${words[@]}"; do
    # -F: treat the entry as a literal string, not a regular expression, so
    #     characters such as '.', '[' or '*' in the word list cannot widen
    #     the match and trigger an unintended deletion.
    # --: an entry beginning with '-' must not be parsed as a grep option.
    if grep -qiF -- "$word" <<< "$base"; then
      remove_flag=true
      break
    fi
  done
  if $remove_flag; then
    echo "Removing '$d' because it contains an undesirable word."
    if $DRY_RUN; then
      echo "Dry-run: would remove '$d'"
    else
      # --: protect against a path that starts with '-'.
      rm -rf -- "$d"
      echo "Removed '$d'"
    fi
  else
    filtered_dirs+=("$d")
  fi
done
# Function: Normalize and clean a directory name. # Function: Normalize and clean a directory name.
clean_name() { clean_name() {
local name="$1" local name="$1"
# Normalize: convert to lower-case, remove punctuation, and trim extra whitespace. # Normalize: convert to lowercase, remove punctuation, and trim extra whitespace.
local normalized local normalized
normalized=$(echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs) normalized=$(echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs)
local cleaned="$normalized" echo "$normalized"
for word in "${words[@]}"; do
# Remove the word (case-insensitive).
cleaned=$(echo "$cleaned" | sed "s/$(echo "$word" | tr '[:upper:]' '[:lower:]')//Ig")
done
# Trim again in case extra spaces were left.
cleaned=$(echo "$cleaned" | xargs)
echo "$cleaned"
} }
# Function: Compute fuzzy similarity between two names using Python's difflib. # Function: Compute fuzzy similarity between two strings using Python's difflib.
compute_similarity() { compute_similarity() {
local name1="$1" local name1="$1"
local name2="$2" local name2="$2"
python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2" python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
} }
# Initialize grouping arrays explicitly. echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
# Initialize grouping arrays.
declare -a group_rep=() # Array for representative cleaned names. declare -a group_rep=() # Array for representative cleaned names.
declare -A groups=() # Associative array: groups[i] holds newline-separated directory paths. declare -A groups=() # Associative array: groups[i] holds newline-separated directory paths.
# Process immediate subdirectories from both DIR1 and DIR2. # Group the directories in filtered_dirs.
for d in "$DIR1"/* "$DIR2"/*; do for d in "${filtered_dirs[@]}"; do
if [ -d "$d" ]; then base=$(basename "$d")
base=$(basename "$d") cleaned=$(clean_name "$base")
cleaned=$(clean_name "$base") added=false
added=false # Compare with each existing group's representative.
# Compare against each existing group's representative. for i in "${!group_rep[@]}"; do
for i in "${!group_rep[@]}"; do rep="${group_rep[$i]}"
rep="${group_rep[$i]}" sim=$(compute_similarity "$rep" "$cleaned")
sim=$(compute_similarity "$rep" "$cleaned") if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then groups["$i"]+=$'\n'"$d"
groups["$i"]+=$'\n'"$d" added=true
added=true break
break
fi
done
# If no similar group found, create a new group.
if [ "$added" = false ]; then
new_index=${#group_rep[@]}
group_rep+=("$cleaned")
groups["$new_index"]="$d"
fi fi
done
# If not added to an existing group, create a new group.
if [ "$added" = false ]; then
new_index=${#group_rep[@]}
group_rep+=("$cleaned")
groups["$new_index"]="$d"
fi fi
done done
echo "=== Automatic Removal Based on Undesirable Words ===" echo "=== Interactive Duplicate Resolution ==="
# For each duplicate group, automatically remove directories whose original names contain # For each group that has more than one directory, prompt the user to select one to remove.
# an undesirable word if at least one alternative in the group does not.
# Auto-removal pass: within each group that holds more than one directory,
# find the first directory whose name contains an undesirable word that at
# least one other member of the group does not contain. That directory is
# removed (or only reported in dry-run mode) and dropped from the group.
# At most one directory is removed per group in this pass.
for key in "${!groups[@]}"; do
  # groups[key] is a newline-separated list of paths. mapfile reads every
  # line; `read -r -a` would stop at the first newline and yield one element,
  # which made the duplicate check below unreachable.
  mapfile -t paths <<< "${groups[$key]}"
  if [ "${#paths[@]}" -gt 1 ]; then
    for path in "${paths[@]}"; do
      base=$(basename -- "$path")
      for word in "${words[@]}"; do
        # Literal (-F), case-insensitive match; '--' guards entries starting
        # with '-'.
        if grep -qiF -- "$word" <<< "$base"; then
          # Only a candidate if some other member of the group is free of
          # this word, i.e. a "clean" alternative exists.
          removal_candidate=false
          for other in "${paths[@]}"; do
            if [ "$other" != "$path" ]; then
              other_base=$(basename -- "$other")
              if ! grep -qiF -- "$word" <<< "$other_base"; then
                removal_candidate=true
                break
              fi
            fi
          done
          if $removal_candidate; then
            echo "Candidate for auto-removal: $path (matches word: '$word')"
            if $DRY_RUN; then
              echo "Dry-run: would remove $path"
            else
              rm -rf -- "$path"
              echo "Removed $path"
            fi
            # Update the group by removing the candidate.
            new_group=()
            for p in "${paths[@]}"; do
              if [ "$p" != "$path" ]; then
                new_group+=("$p")
              fi
            done
            groups["$key"]=$(printf "%s\n" "${new_group[@]}")
            # Leave both the word loop and the path loop: paths is stale now,
            # and the next group re-reads its own list from groups.
            break 2
          fi
        fi
      done
    done
  fi
done
echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="
# For any remaining duplicate groups, automatically remove all but the first directory.
for key in "${!groups[@]}"; do for key in "${!groups[@]}"; do
IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
# Filter out directories that no longer exist. # Filter out directories that no longer exist.
@ -173,18 +148,27 @@ for key in "${!groups[@]}"; do
fi fi
done done
if [ "${#existing[@]}" -gt 1 ]; then if [ "${#existing[@]}" -gt 1 ]; then
echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):" echo "Duplicate group (cleaned representative: ${group_rep[$key]:-unknown}):"
echo "Keeping: ${existing[0]}" i=1
for (( i=1; i<${#existing[@]}; i++ )); do for p in "${existing[@]}"; do
echo "Auto-removing: ${existing[$i]}" echo " [$i] $p"
if $DRY_RUN; then ((i++))
echo "Dry-run: would remove ${existing[$i]}"
else
rm -rf "${existing[$i]}"
echo "Removed ${existing[$i]}"
fi
done done
# Ask which member of the current duplicate group to delete. The menu printed
# just before this is 1-based; 0 or any invalid input skips the group.
#
# NOTE(review): this prompt is only reached when the group yields more than
# one existing path. The group list is split with `read -r -a` on a
# here-string, which looks like it captures only the first line of the
# newline-separated list; verify that multi-entry groups actually reach here.
echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
# read returns non-zero at end of input (e.g. stdin closed or redirected from
# an exhausted file); under `set -e` that would abort the whole script, so an
# empty answer at EOF is treated as "skip".
if ! read -r choice; then
  choice=${choice:-0}
fi
if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then
  # 10# forces base 10: an answer with a leading zero such as "08" would
  # otherwise be parsed as an invalid octal number and kill the script.
  dir_to_remove="${existing[$((10#$choice - 1))]}"
  if $DRY_RUN; then
    echo "Dry-run: would remove '$dir_to_remove'"
  else
    # --: protect against a path that starts with '-'.
    rm -rf -- "$dir_to_remove"
    echo "Removed '$dir_to_remove'"
  fi
else
  echo "No removal selected for this group."
fi
fi fi
done done
echo "Script completed." echo "Script completed."

5
words
View File

@ -1,5 +1,8 @@
dv dv
1080 DV
finnish finnish
FINNISH
norwegian norwegian
NORWEGIAN
swedish swedish
SWEDISH