#!/bin/bash
#
# Builds corpus/umbc30m from the UMBC WebBase corpus.
#
# Steps, all performed inside ./data:
#   1. download and extract umbc_webbase_corpus.tgz
#   2. run umbc_corpus_prepare.py
#      (presumably writes the *_*.txt files into webbase_processed/ --
#      TODO confirm against that script)
#   3. take the first N lines of each selected page group
#   4. concatenate those excerpts into corpus/umbc30m
#      (10M + 4 * 5M = at most 30M lines)

# Stop at the first failing command or unset variable. pipefail is left
# off on purpose: head closes the pipe early, which kills cat with
# SIGPIPE and would make every excerpt pipeline look like a failure.
set -eu

readonly PROCESS_DIR=webbase_processed
readonly FINAL_DIR=corpus
readonly ARCHIVE=umbc_webbase_corpus.tgz
readonly ARCHIVE_URL="http://ebiquity.umbc.edu/share/${ARCHIVE}"

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Write the first COUNT lines of a page group to ${PROCESS_DIR}/NAME_init.
# Globals:   PROCESS_DIR (read), init_files (appended)
# Arguments: $1 - file-name prefix of the group (before "_*.txt")
#            $2 - maximum number of lines to keep
#            $3 - short name used for the output file
#######################################
init_files=()
extract_head() {
  local prefix=$1 count=$2 name=$3
  local out="${PROCESS_DIR}/${name}_init"
  local -a parts=("${PROCESS_DIR}/${prefix}"_*.txt)

  # An unmatched glob stays literal; without this check cat would fail
  # but head would still succeed and leave an empty excerpt behind.
  [[ -e "${parts[0]}" ]] \
    || die "no files match ${PROCESS_DIR}/${prefix}_*.txt"

  cat -- "${parts[@]}" | head -n "$count" > "$out"
  init_files+=("$out")
}

# Everything below uses paths relative to data/; never continue elsewhere.
cd data || die "cannot cd to data/"

# -c resumes a partial download and avoids creating "${ARCHIVE}.1" on re-run.
wget -c "$ARCHIVE_URL"

tar -xvf "$ARCHIVE"

# -p keeps re-runs from failing when the directories already exist.
mkdir -p "$PROCESS_DIR" "$FINAL_DIR"

python umbc_corpus_prepare.py

extract_head delorme.com_shu.pages       10000000 delorme
extract_head mbta.com_mtu.pages           5000000 mbta
extract_head ucdavis_wnba.pages           5000000 ucdavis
extract_head utexas_iit.pages             5000000 utexas
extract_head weather.yahoo_bbk.ac.pages   5000000 weather

# Concatenate exactly the excerpts produced above (same order the original
# "*_init" glob yields) so stale *_init files cannot leak into the corpus.
cat -- "${init_files[@]}" > "${FINAL_DIR}/umbc30m"
