#!/usr/bin/env bash
# Stage 1: write a shuffled copy of the tokenized 20 Newsgroups corpus.
# The shuffled copy is the file handed to ste-d2vC via -test below.
#
# Earlier dataset variants, kept for reference (these were run from inside
# the data/ directory):
##cp Wiki10000lines Wiki10000lines_shuf
##cp tokenized_Wiki10000lines.txt Wiki10000lines_shuf
##gshuf Wiki10000lines_shuf
#shuf alldata > alldata_shuf
#
# Paths are relative to the project root rather than cd-ing into data/ and
# back: an unchecked `cd data` followed by `cd ..` would, on failure, leave
# every later step running one directory above the project.
shuf ./data/tokenize_20newsgroup_all.txt > ./data/tokenize_20newsgroup_all_shuf || {
  echo "error: could not shuffle ./data/tokenize_20newsgroup_all.txt" >&2
  exit 1
}

# Stage 2: rebuild from scratch, then run ste-d2vC (presumably the binary the
# top-level Makefile produces -- confirm). The run writes word vectors to
# vectors_kk_dc_10000lines, the vocabulary to vocab.txt, and the vectors for
# the -test file to docvectors.txt, which the conversion step below reads.
make clean
make || {
  echo "error: build failed; not running ste-d2vC" >&2
  exit 1
}

# Abort on failure so later stages never consume a stale docvectors.txt left
# over from a previous run.
./ste-d2vC \
  -train ./data/tokenize_20newsgroup_all.txt \
  -output vectors_kk_dc_10000lines \
  -size 400 -window 10 -negative 8 -sample 1e-4 \
  -threads 10 -binary 0 \
  -out_iter 15 -in_iter 15 \
  -min-count 5 -K 20 -sentence-sample 0.1 \
  -test ./data/tokenize_20newsgroup_all_shuf \
  -test-output docvectors.txt \
  -save-vocab vocab.txt || {
  echo "error: ste-d2vC failed" >&2
  exit 1
}

# Previous configuration (Wikipedia sample), kept for reference:
#./ste-d2vC -train ./data/Wiki10000lines_shuf -output vectors_kk_dc_10000lines -size 300 -window 8 -negative 6 -sample 1e-4 -threads 10 -binary 0 -out_iter 5 -in_iter 5 -min-count 5 -K 10 -sentence-sample 0.1 -test ./data/Wiki10000lines -test-output docvectors.txt

# to_libsvm [N_POS]
#
# Convert whitespace-separated rows on stdin into labelled sparse rows on
# stdout, one per input row:
#
#   <label> 1:<field1> 2:<field2> ... <fieldN> (each pair followed by a space)
#
# The first N_POS rows (default 12500) get label "1"; all later rows get
# label "-1". Fields are emitted through a fixed "%d:%s " format, so a field
# containing "%" is copied verbatim instead of being interpreted as part of
# the printf format string.
to_libsvm() {
  awk -v n_pos="${1:-12500}" '
    {
      if (NR <= n_pos)
        printf "1 "
      else
        printf "-1 "
      for (i = 1; i <= NF; i++)
        printf "%d:%s ", i, $i
      print ""
    }'
}

# Stage 3: the first 50000 rows of docvectors.txt become the training set and
# the last 50000 rows the test set.
# NOTE(review): if docvectors.txt has fewer than 100000 rows, head and tail
# return overlapping rows (identical sets below 50000) -- confirm the 50000
# and 12500 constants match the current corpus.
head -n 50000 docvectors.txt | to_libsvm 12500 > train.txt
tail -n 50000 docvectors.txt | to_libsvm 12500 > test.txt
# Alternative split previously used (100000 rows per set):
##head -n 100000 docvectors.txt | to_libsvm 12500 > train.txt
##tail -n 100000 docvectors.txt | to_libsvm 12500 > test.txt
# Stage 4: train and evaluate a liblinear classifier on the converted data.
# Every step aborts on failure: an unchecked `cd liblinear` would otherwise
# run `make` in the project root and the final `cd ..` would leave the
# project directory altogether.
##cp train.txt test.txt libsvm
cp train.txt test.txt liblinear || {
  echo "error: could not copy train.txt/test.txt into liblinear/" >&2
  exit 1
}
##cd libsvm
cd liblinear || {
  echo "error: cannot enter liblinear/" >&2
  exit 1
}
make || {
  echo "error: building liblinear failed" >&2
  exit 1
}
export PATH="$PATH:/opt/ste-avg-d2vC/liblinear"
##export PATH=$PATH:/opt/ste-avg-d2vC/libsvm
# -s 0 selects L2-regularized logistic regression; -b 1 makes predict write
# probability estimates (see the liblinear README).
./train -s 0 ./train.txt model.logreg || {
  echo "error: liblinear train failed" >&2
  exit 1
}
./predict -b 1 ./test.txt model.logreg out.logreg || {
  echo "error: liblinear predict failed" >&2
  exit 1
}
##./svm-train -s 0 -h 0 -b 1 ./train.txt model.logreg
##./svm-predict -b 1 ./test.txt model.logreg out.logreg
cd ..
