first release
This commit is contained in:
parent
5a7fd15e08
commit
430076ad00
190
compare
Executable file
190
compare
Executable file
@ -0,0 +1,190 @@
|
||||
#!/bin/bash
|
||||
# compare_dirs.sh
|
||||
#
|
||||
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
|
||||
#
|
||||
# This script:
|
||||
# - Scans immediate subdirectories in <dir1> and <dir2>
|
||||
# - Normalizes and “cleans” their names by removing punctuation, converting to lower-case,
|
||||
# trimming whitespace, and removing any words listed in <words_file>
|
||||
# - Groups directories using a fuzzy matching algorithm (with a configurable threshold)
|
||||
# - Automatically removes a directory from a duplicate group if its original name contains an
|
||||
# undesirable word while an alternative does not
|
||||
# - Then automatically removes any remaining duplicates in each group (keeping the first directory)
|
||||
# - Supports a --dry-run mode that shows actions without deleting directories
|
||||
|
||||
set -euo pipefail

# Default options
DRY_RUN=false            # true => only report what would be deleted
SIMILARITY_THRESHOLD=0.8 # minimum similarity ratio for two names to be grouped

# Process command-line flags.
# "${1:-}" is used instead of "$1" because, under `set -u`, referencing "$1"
# when no arguments remain aborts with "unbound variable" before the usage
# message below can be printed.
while [[ "${1:-}" == --* ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --threshold)
      # Without this check a trailing "--threshold" trips `set -u` on "$2".
      if [ "$#" -lt 2 ]; then
        echo "Error: --threshold requires a value." >&2
        exit 1
      fi
      SIMILARITY_THRESHOLD="$2"
      shift 2
      ;;
    --)
      # Conventional end-of-options marker; everything after it is positional.
      shift
      break
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# The threshold is later handed to bc; reject anything that is not a plain
# non-negative decimal number so a typo cannot silently change what is grouped.
threshold_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
if ! [[ "$SIMILARITY_THRESHOLD" =~ $threshold_re ]]; then
  echo "Error: --threshold must be a non-negative number, got '$SIMILARITY_THRESHOLD'." >&2
  exit 1
fi

if [ "$#" -ne 3 ]; then
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>" >&2
  exit 1
fi

DIR1="$1"
DIR2="$2"
WORDS_FILE="$3"

# Verify input paths
if [ ! -d "$DIR1" ]; then
  echo "Error: Directory '$DIR1' not found." >&2
  exit 1
fi
if [ ! -d "$DIR2" ]; then
  echo "Error: Directory '$DIR2' not found." >&2
  exit 1
fi
if [ ! -f "$WORDS_FILE" ]; then
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi

# Read undesirable words (one per line) into an array, filtering out blank lines.
# grep runs inside a process substitution, so its exit status 1 (no lines
# selected, e.g. an empty words file) does not abort the script.
mapfile -t words < <(grep -v -e '^[[:space:]]*$' -- "$WORDS_FILE")
|
||||
|
||||
# Function: Normalize and clean a directory name.
# Globals:   words (read) - array of undesirable words
# Arguments: $1 - directory base name
# Outputs:   the cleaned name on stdout (may be empty)
#
# Steps: lower-case the name, drop every character that is not a-z, 0-9 or a
# space, collapse/trim whitespace, then delete every occurrence of each
# undesirable word and trim again.
clean_name() {
  local name="$1"
  local normalized cleaned word needle

  # printf instead of echo: echo would swallow names such as "-n" or "-e".
  # xargs (no command) re-emits the tokens separated by single spaces, which
  # both trims and collapses whitespace.
  normalized=$(printf '%s\n' "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs)
  cleaned="$normalized"

  for word in "${words[@]}"; do
    needle="${word,,}"
    [[ -n "$needle" ]] || continue
    # Quoted pattern => literal substring removal. Interpolating the word into
    # a sed expression broke on words containing "/" and treated ".", "*", "["
    # etc. as regex operators.
    cleaned="${cleaned//"$needle"/}"
  done

  # Trim again in case extra spaces were left behind by the removals.
  cleaned=$(printf '%s\n' "$cleaned" | xargs)
  printf '%s\n' "$cleaned"
}
|
||||
|
||||
# Function: Compute fuzzy similarity between two names using Python's difflib.
# Arguments: $1, $2 - the two strings to compare
# Outputs:   the value of SequenceMatcher(None, $1, $2).ratio() on stdout
compute_similarity() {
  # The Python program is kept in a variable; both strings are handed over as
  # argv entries so they are never interpreted as Python source.
  local -r matcher_code='import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())'
  local -r first="$1"
  local -r second="$2"

  python3 -c "$matcher_code" "$first" "$second"
}
|
||||
|
||||
# Initialize grouping arrays explicitly.
declare -a group_rep=()  # group_rep[i]: cleaned name of the first directory put in group i.
declare -A groups=()     # groups[i]: newline-separated directory paths belonging to group i.

# Build the list of candidate entries. When both arguments refer to the same
# directory it is scanned only once: otherwise every sub-directory would be
# listed twice, be grouped with itself, and later be treated as its own
# duplicate.
scan_entries=("$DIR1"/*)
if [[ ! "$DIR1" -ef "$DIR2" ]]; then
  scan_entries+=("$DIR2"/*)
fi

# Process immediate subdirectories from both DIR1 and DIR2.
for d in "${scan_entries[@]}"; do
  # Skips plain files and the literal pattern left behind by an unmatched glob.
  [ -d "$d" ] || continue

  base=$(basename -- "$d")
  cleaned=$(clean_name "$base")
  added=false

  # A name that cleans to the empty string (only punctuation, only non-ASCII
  # characters, only undesirable words) carries no information. Two empty
  # strings compare as 1.0, so such directories must never be matched against
  # anything; each one gets a group of its own.
  if [[ -n "$cleaned" ]]; then
    # Compare against each existing group's representative; first hit wins.
    for i in "${!group_rep[@]}"; do
      rep="${group_rep[$i]}"
      [[ -n "$rep" ]] || continue
      sim=$(compute_similarity "$rep" "$cleaned")
      # bc prints 1 when the comparison holds, 0 otherwise.
      if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
        groups["$i"]+=$'\n'"$d"
        added=true
        break
      fi
    done
  fi

  # If no similar group found, create a new group.
  if [ "$added" = false ]; then
    new_index=${#group_rep[@]}
    group_rep+=("$cleaned")
    groups["$new_index"]="$d"
  fi
done
|
||||
|
||||
echo "=== Automatic Removal Based on Undesirable Words ==="

# name_has_word <name> <word>
# Succeeds when <word> occurs in <name>, compared case-insensitively and as a
# literal substring (no regex interpretation, no option parsing).
name_has_word() {
  local name="${1,,}"
  local word="${2,,}"
  [[ -n "$word" && "$name" == *"$word"* ]]
}

# prune_undesirable_group <key>
# Globals:   groups (read/written), words (read), DRY_RUN (read)
# For the group stored in groups[<key>], removes directories whose original
# name contains an undesirable word while at least one other member of the
# group does not contain that word. Repeats until no such directory is left,
# then stores the surviving paths back into groups[<key>]. The member that
# justified a removal is never removed by it, so a group never becomes empty.
# In dry-run mode nothing is deleted but the group is still updated.
prune_undesirable_group() {
  local key="$1"
  local -a paths=() remaining=()
  local path other word p victim victim_word

  # mapfile splits on every newline. `read -a` stops at the first newline, so
  # it only ever saw the first path and no group looked like a duplicate.
  mapfile -t paths <<< "${groups[$key]}"

  while (( ${#paths[@]} > 1 )); do
    victim=""
    victim_word=""
    for path in "${paths[@]}"; do
      for word in "${words[@]}"; do
        name_has_word "${path##*/}" "$word" || continue
        for other in "${paths[@]}"; do
          [[ "$other" != "$path" ]] || continue
          if ! name_has_word "${other##*/}" "$word"; then
            victim="$path"
            victim_word="$word"
            break 3
          fi
        done
      done
    done

    # Nothing (more) to remove in this group.
    [[ -n "$victim" ]] || break

    echo "Candidate for auto-removal: $victim (matches word: '$victim_word')"
    if $DRY_RUN; then
      echo "Dry-run: would remove $victim"
    else
      rm -rf -- "$victim"
      echo "Removed $victim"
    fi

    # Update the group by removing the candidate, then look again.
    remaining=()
    for p in "${paths[@]}"; do
      [[ "$p" == "$victim" ]] || remaining+=("$p")
    done
    paths=("${remaining[@]}")
    groups["$key"]=$(printf '%s\n' "${paths[@]}")
  done
}

# For each duplicate group, automatically remove directories whose original names contain
# an undesirable word if at least one alternative in the group does not.
for key in "${!groups[@]}"; do
  prune_undesirable_group "$key"
done
|
||||
|
||||
echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="

# dedupe_group <key>
# Globals:   groups (read), group_rep (read), DRY_RUN (read)
# For the group stored in groups[<key>], keeps the first directory that still
# exists and removes every other existing directory of the group (or only
# reports it when DRY_RUN is true).
dedupe_group() {
  local key="$1"
  local -a paths=() existing=()
  local path seen already i

  # mapfile splits on every newline. `read -a` stops at the first newline, so
  # only the first path was ever seen and nothing was detected as a duplicate.
  mapfile -t paths <<< "${groups[$key]}"

  # Filter out directories that no longer exist, and entries that refer to a
  # directory already collected (same path listed twice, or two spellings of
  # the same directory) so the kept directory can never be deleted as its own
  # duplicate.
  for path in "${paths[@]}"; do
    [[ -n "$path" && -d "$path" ]] || continue
    already=false
    for seen in "${existing[@]}"; do
      if [[ "$path" -ef "$seen" ]]; then
        already=true
        break
      fi
    done
    if [ "$already" = false ]; then
      existing+=("$path")
    fi
  done

  if (( ${#existing[@]} > 1 )); then
    echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):"
    echo "Keeping: ${existing[0]}"
    for (( i = 1; i < ${#existing[@]}; i++ )); do
      echo "Auto-removing: ${existing[$i]}"
      if $DRY_RUN; then
        echo "Dry-run: would remove ${existing[$i]}"
      else
        rm -rf -- "${existing[$i]}"
        echo "Removed ${existing[$i]}"
      fi
    done
  fi
}

# For any remaining duplicate groups, automatically remove all but the first directory.
for key in "${!groups[@]}"; do
  dedupe_group "$key"
done

echo "Script completed."
|
Loading…
x
Reference in New Issue
Block a user