summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: ilotterytea <iltsu@alright.party> 2025-07-05 10:01:29 +0500
committer: ilotterytea <iltsu@alright.party> 2025-07-05 10:01:29 +0500
commit: 53545015c2ab7fd6bed82e9fcb3261e80dab61a3 (patch)
tree: 662b3c7d5bcce292e8de9f181e71c586d82d6235
parent: af1648420b1a343aa95d3cd3abe89c5283c7aaba (diff)
feat: a script for training the model (HEAD, master)
-rwxr-xr-xtrain.sh61
1 files changed, 61 insertions, 0 deletions
diff --git a/train.sh b/train.sh
new file mode 100755
index 0000000..cc172b2
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Builds a first-order Markov chain from a plain-text corpus.
+#
+# Usage: train.sh filename.txt
+#
+# Counts how often each whitespace-separated word is directly followed by
+# another word and writes one SQL INSERT statement per distinct word pair
+# to output.sql in the current directory. Progress is printed to stdout.
+
+if [ -z "$1" ]; then
+  echo "Markov chain generator."
+  echo "Usage: $0 filename.txt"
+  exit 1
+fi
+
+if [[ ! -r "$1" || -d "$1" ]]; then
+  echo "Error: cannot read '$1'" >&2
+  exit 1
+fi
+
+# Feed the file through stdin so wc prints only the number; the arithmetic
+# expansion drops the padding some wc implementations put around it.
+line_count=$(( $(wc -l < "$1") ))
+
+# Maps "previous next" (joined by one space) to the number of times that
+# pair was seen. Words never contain a space because they are split on
+# whitespace, so the key can always be taken apart again unambiguously.
+declare -A counts=()
+
+current_line=0
+# Deliberately not reset per line: the last word of a line is chained to
+# the first word of the next one.
+prev_word=""
+while IFS= read -r line || [[ -n "$line" ]]; do
+  current_line=$((current_line + 1))
+  # read -a splits on whitespace without pathname expansion, so a word like
+  # "*" stays a word instead of becoming the file names of the working dir.
+  read -r -a words <<< "$line"
+  for word in "${words[@]}"; do
+    if [[ -n "$prev_word" ]]; then
+      key="$prev_word $word"
+      # Fetch the old count with a plain expansion. Referencing
+      # counts[...] directly inside $(( )) can make bash expand the
+      # subscript a second time and run any $(...) found in the corpus.
+      count=${counts["$key"]:-0}
+      counts["$key"]=$((count + 1))
+    fi
+    prev_word="$word"
+  done
+  echo "Parsing... ($current_line/$line_count)"
+done < "$1"
+
+output_file="output.sql"
+origin_name="$1"
+
+# Open (and truncate) the output once instead of reopening it per row.
+if ! exec 3> "$output_file"; then
+  echo "Error: cannot write to '$output_file'" >&2
+  exit 1
+fi
+
+echo "Packing up..."
+
+# Escapes a value for use inside a single-quoted SQL string literal.
+# Arguments: $1 - raw text
+# Outputs:   $1 with every single quote and every backslash doubled
+escape_sql() {
+  local s="$1"
+  local quote="'" backslash='\'
+  s=${s//"$quote"/"$quote$quote"}
+  s=${s//"$backslash"/"$backslash$backslash"}
+  # printf rather than echo: echo would swallow values such as "-n" or "-e".
+  printf '%s\n' "$s"
+}
+
+current_line=0
+line_count=${#counts[@]}
+# The origin name is the same for every row, so escape it only once.
+ename=$(escape_sql "$origin_name")
+
+for key in "${!counts[@]}"; do
+  current_line=$((current_line + 1))
+  # Split the key at its single space separator.
+  from=${key%% *}
+  to=${key#* }
+  efrom=$(escape_sql "$from")
+  eto=$(escape_sql "$to")
+  weight=${counts["$key"]}
+  printf '%s\n' "INSERT INTO chains(origin_name, from_word, to_word, weight) VALUES ('$ename', '$efrom', '$eto', $weight);" >&3
+  echo "Packing up... ($current_line/$line_count)"
+done
+
+exec 3>&-
+
+echo "Done! Markov chain output saved to '$output_file'! Feed your database with it."