#!/usr/bin/env bash
#
# Build a fastText skip-gram model from the HTML pages in the
# yanonono/booth-update repository.
#
# Steps:
#   1. Clone the repository (skipped when ./booth-update already exists).
#   2. Flatten every *.html file to one line of tag-free text and append it
#      to japanese.txt (one document per line).
#   3. Tokenise japanese.txt with MeCab in wakati (space-separated) mode,
#      writing wakati.txt.
#   4. Train a fastText skip-gram model on the tokenised text.
#
# Environment:
#   MECAB_DICDIR  MeCab dictionary directory (defaults to the
#                 mecab-ipadic-neologd path used originally).
#
# Note: japanese.txt is appended to, not truncated; delete it first if a
# fresh corpus is wanted.

# Stop at the first failing stage instead of training on partial data.
set -euo pipefail

readonly REPO_URL='git@github.com:yanonono/booth-update.git'
readonly REPO_DIR='./booth-update'
readonly MECAB_DICDIR="${MECAB_DICDIR:-/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd}"

# git clone refuses to write into an existing directory; reuse it on re-runs.
if [[ ! -d "${REPO_DIR}" ]]; then
  git clone "${REPO_URL}"
fi

# NUL-delimited find + read keeps paths with spaces or glob characters intact
# (the former `for file in $(find ...)` loop word-split them).
while IFS= read -r -d '' html_file; do
  # Join the file into a single line, drop HTML tags and the two shop-UI
  # strings, then squeeze runs of spaces down to one.
  tr -d '\n' < "${html_file}" \
    | sed -E \
        -e 's/<[^>]*>//g' \
        -e 's/ダウンロード商品//g' \
        -e 's/カートに入れる//g' \
        -e 's/ +/ /g'
  # Terminate the document with exactly one real newline. `echo "\n"` emitted
  # a literal backslash-n under bash, which ended up in the corpus.
  printf '\n'
done < <(find "${REPO_DIR}" -type f -name '*.html' -print0) >> japanese.txt

# -Owakati: space-separated tokens; -b: MeCab input buffer size.
mecab -d "${MECAB_DICDIR}" -Owakati -b 24576 japanese.txt > wakati.txt

# NOTE(review): wakati.txt is written to the current directory above, yet this
# step runs ./fasttext and reads ../wakati.txt. Presumably it is meant to be
# run from a fastText build directory one level below; confirm and add the
# missing `cd` (or adjust the paths) before relying on this as one script.
./fasttext skipgram -input ../wakati.txt -output model -dim 300 -neg 25 -ws 8
Last active
May 22, 2019 14:39
-
-
Save yanonono/6863cb31670407447532e9232164162a to your computer and use it in GitHub Desktop.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment