name change and multiprocessing the fuzzy operation
This commit is contained in:
parent
9e6961c406
commit
56d41fb487
@ -9,7 +9,8 @@
|
|||||||
# the directory is removed outright.
|
# the directory is removed outright.
|
||||||
# 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
|
# 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
|
||||||
# and then grouped by fuzzy similarity using a configurable threshold.
|
# and then grouped by fuzzy similarity using a configurable threshold.
|
||||||
# 4. Within each group, if at least one directory contains "2160p" and one contains "1080p",
|
# The fuzzy similarity process is optimized with a multiprocessing helper.
|
||||||
|
# 4. Within each group, if one directory’s name contains "2160p" and another contains "1080p",
|
||||||
# the 1080p directory(ies) are removed (or flagged in dry-run mode).
|
# the 1080p directory(ies) are removed (or flagged in dry-run mode).
|
||||||
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
|
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
|
||||||
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.
|
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.
|
||||||
@ -97,15 +98,27 @@ done
|
|||||||
# Function: Normalize and clean a directory name.
|
# Function: Normalize and clean a directory name.
|
||||||
clean_name() {
|
clean_name() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
# Normalize: convert to lowercase, remove punctuation, and trim extra whitespace.
|
|
||||||
echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs
|
echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs
|
||||||
}
|
}
|
||||||
|
|
||||||
# Function: Compute fuzzy similarity between two names using Python's difflib.
|
# Function: Compute fuzzy similarities between a target and a list of strings using multiprocessing.
|
||||||
compute_similarity() {
|
compute_similarities() {
|
||||||
local name1="$1"
|
local target="$1"
|
||||||
local name2="$2"
|
shift
|
||||||
python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
|
# Pass target and the list of representatives as command-line arguments to Python.
|
||||||
|
python3 - "$target" "$@" <<EOF
|
||||||
|
import sys
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
target = sys.argv[1]
|
||||||
|
reps = sys.argv[2:]
|
||||||
|
def similarity(rep):
|
||||||
|
return SequenceMatcher(None, target, rep).ratio()
|
||||||
|
with Pool() as pool:
|
||||||
|
results = pool.map(similarity, reps)
|
||||||
|
print(" ".join(map(str, results)))
|
||||||
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
|
echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
|
||||||
@ -118,17 +131,18 @@ for d in "${filtered_dirs[@]}"; do
|
|||||||
base=$(basename "$d")
|
base=$(basename "$d")
|
||||||
cleaned=$(clean_name "$base")
|
cleaned=$(clean_name "$base")
|
||||||
added=false
|
added=false
|
||||||
# Compare with each existing group's representative.
|
if [ "${#group_rep[@]}" -gt 0 ]; then
|
||||||
for i in "${!group_rep[@]}"; do
|
# Compute similarities between the cleaned name and all group representatives concurrently.
|
||||||
rep="${group_rep[$i]}"
|
similarities=$(compute_similarities "$cleaned" "${group_rep[@]}")
|
||||||
sim=$(compute_similarity "$rep" "$cleaned")
|
read -r -a sims <<< "$similarities"
|
||||||
if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
|
for i in "${!sims[@]}"; do
|
||||||
|
if (( $(echo "${sims[$i]} >= $SIMILARITY_THRESHOLD" | bc -l) )); then
|
||||||
groups["$i"]+=$'\n'"$d"
|
groups["$i"]+=$'\n'"$d"
|
||||||
added=true
|
added=true
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
# If not added to an existing group, create a new group.
|
fi
|
||||||
if [ "$added" = false ]; then
|
if [ "$added" = false ]; then
|
||||||
new_index=${#group_rep[@]}
|
new_index=${#group_rep[@]}
|
||||||
group_rep+=("$cleaned")
|
group_rep+=("$cleaned")
|
||||||
@ -153,7 +167,7 @@ for key in "${!groups[@]}"; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
if $has_2160p && $has_1080p; then
|
if $has_2160p && $has_1080p; then
|
||||||
echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
|
echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
|
||||||
new_group=()
|
new_group=()
|
||||||
for path in "${paths[@]}"; do
|
for path in "${paths[@]}"; do
|
||||||
base=$(basename "$path")
|
base=$(basename "$path")
|
||||||
@ -185,7 +199,7 @@ for key in "${!groups[@]}"; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
if [ "${#existing[@]}" -gt 1 ]; then
|
if [ "${#existing[@]}" -gt 1 ]; then
|
||||||
echo "Duplicate group (representative cleaned name: ${group_rep[$key]:-unknown}):"
|
echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
|
||||||
i=1
|
i=1
|
||||||
for p in "${existing[@]}"; do
|
for p in "${existing[@]}"; do
|
||||||
echo " [$i] $p"
|
echo " [$i] $p"
|
Loading…
x
Reference in New Issue
Block a user