// Copyright<2016>
// contact: bshi@se.cuhk.edu.hk

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h> // added to support clock_t

// Size of the fixed character buffers used for single words and for the
// file-name options (train_file, output_file, ...).
#define MAX_STRING 100
// Number of intervals in the precomputed sigmoid table (expTable).
#define EXP_TABLE_SIZE 1000
// The sigmoid table covers inputs in the range [-MAX_EXP, MAX_EXP].
#define MAX_EXP 6
// Maximum number of characters kept per input line (ReadSentence truncates
// longer lines); also sizes the per-document word buffers.
#define MAX_SENTENCE_LENGTH 100000
// Capacity of doc_list: one entry per line of the training file.
#define MAX_DOCUMENTS 4000000
#define MAX_SENTENCE_SAMPLE 100// added for d2vC: capacity of the sen_sample buffer used in TrainModelThread
//#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) // needed to build on Windows, where posix_memalign is not available.
// Maximum 30 * 0.7 = 21M words in the vocabulary
// (LearnVocabFromTrainFile calls ReduceVocab once the table is 70% full).
const int vocab_hash_size = 30000000;

typedef float real;  // Precision of float numbers

// One vocabulary entry.
struct vocab_word {
  long long cn;  // occurrence count of the word
  char *word;    // heap-allocated, NUL-terminated word text (see AddWordToVocab)
};

// One training document (one line of the training file).
struct doc {
  // Heap copy of the raw line text (filled in LearnVocabFromTrainFile).
  char *content;
  // First the number of space-separated tokens counted while reading the
  // line; AddWordsInDoc then resets it to the number of entries in word_list.
  long long length;
  // Vocabulary indices of the in-vocabulary words (filled in AddWordsInDoc).
  long long* word_list;
};
/*
struct test_doc {
  char *test_content;
  long long test_length;
  long long* test_word_list;
};
*/
// File names taken from the command line (see main).
char train_file[MAX_STRING], output_file[MAX_STRING], test_file[MAX_STRING], test_output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
// Growable vocabulary array; vocab_size entries are in use.
struct vocab_word *vocab;
// Training documents; train_docs entries are in use.
struct doc *doc_list;
// NOTE(review): test_doc_list is not referenced anywhere in this file.
struct doc *test_doc_list;
int binary = 0, debug_mode = 2, window = 5,
  min_count = 5, num_threads = 12, min_reduce = 5;
// Open-addressing hash table mapping word hash -> index into vocab (-1 = empty slot).
int *vocab_hash;
// layer1_size is the vector length per topic; num_topics is the number of
// topic vectors stored per word.
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100, num_topics = 10;
long long train_words = 0, test_words = 0, word_count_actual = 0, out_iter = 3, in_iter = 5, file_size = 0, classes = 0, train_docs = 0, test_docs = 0;
// NOTE(review): line_size is not referenced anywhere in this file.
long long *line_size = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
// syn0 / syn1neg each hold vocab_size * num_topics * layer1_size values
// (allocated in InitNet); expTable is the sigmoid lookup table built in main.
// NOTE(review): syn1 is never allocated or used in this file.
real *syn0, *syn1, *syn1neg, *expTable;
// Probability with which TrainModelThread samples each other word of the
// document into the context average (-sentence-sample option).
real rp_sample=0.1;
// plsa arguments
// p_z_d holds train_docs * num_topics values: a per-document topic
// distribution, normalized in InitNet and updated during training.
real *p_z_d;
// Training start time, used for the progress display.
clock_t start;

// Number of negative samples per training step.
int negative = 5;
// Size of the unigram sampling table built by InitUnigramTable.
const int table_size = 1e8;
int *table;

// Builds the negative-sampling table.
//
// `table` receives table_size vocabulary indices laid out so that word i
// occupies a share of the table proportional to cn(i)^0.75. Drawing a uniform
// random slot therefore draws a word from the smoothed unigram distribution.
//
// Reads: vocab, vocab_size, table_size.  Writes: table (heap-allocated here).
// Exits the process if the table cannot be allocated.
void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0;
  double d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  if (table == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }
  // With an empty vocabulary there is no distribution to tabulate; keep the
  // table well-defined instead of dividing by zero below.
  if (vocab_size <= 0) {
    memset(table, 0, table_size * sizeof(int));
    return;
  }
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  // d1 is the cumulative probability mass of words 0..i.
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    // Advance to the next word once this slot lies past the mass covered so
    // far. Never step beyond the last word: rounding can leave d1 slightly
    // below 1, and vocab[vocab_size] is not a valid entry.
    if (a / (double)table_size > d1 && i < vocab_size - 1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
  }
}

int SearchVocab(char *word);

// Reads the next non-empty line from `fin` into `sentence`.
//
// Empty lines are skipped. The newline is not stored. Lines longer than
// MAX_SENTENCE_LENGTH - 2 characters are truncated (the last stored character
// is overwritten by each further one). `sentence` must have room for at least
// MAX_SENTENCE_LENGTH bytes and is always NUL-terminated on return.
//
// When the end of the file (or a read error) is reached before a newline, the
// characters read so far are returned and feof(fin) / ferror(fin) is set;
// callers in this file test feof(fin) after the call.
void ReadSentence(char *sentence, FILE *fin) {
  int a = 0, ch;
  // Test fgetc's result directly: looping on !feof() would store the EOF
  // sentinel as a character and would never terminate on a read error.
  while ((ch = fgetc(fin)) != EOF) {
    if (ch == '\n') {
      if (a > 0) break;  // end of a non-empty line
      continue;          // skip empty lines
    }
    sentence[a] = ch;
    a++;
    if (a >= MAX_SENTENCE_LENGTH - 1) a--; // Truncate too long document
  }
  sentence[a] = 0;
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries.
//
// Carriage returns (13) are ignored. A newline met before any word character
// yields the special token "</s>"; a newline that ends a word is pushed back
// so that the next call returns "</s>". Words longer than MAX_STRING - 2
// characters are truncated. `word` must have room for MAX_STRING bytes and is
// always NUL-terminated on return.
//
// At end of file (or on a read error) the characters read so far are returned
// and feof(fin) / ferror(fin) is set; callers in this file test feof(fin).
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  // Test fgetc's result directly so the EOF sentinel is never stored in the
  // word and a read error cannot cause an endless loop.
  while ((ch = fgetc(fin)) != EOF) {
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}

// Computes the hash-table slot for a word: a base-257 polynomial hash of its
// characters, reduced modulo vocab_hash_size.
int GetWordHash(char *word) {
  unsigned long long acc = 0;
  const char *p;
  for (p = word; *p != 0; p++) {
    acc = acc * 257 + *p;
  }
  return acc % vocab_hash_size;
}

// Looks a word up in the vocabulary hash table (linear probing).
// Returns its index in `vocab`, or -1 when the word is not present.
int SearchVocab(char *word) {
  unsigned int slot = GetWordHash(word);
  for (;;) {
    int entry = vocab_hash[slot];
    if (entry == -1) return -1;                       // empty slot: not in table
    if (strcmp(word, vocab[entry].word) == 0) return entry;
    slot = (slot + 1) % vocab_hash_size;              // probe the next slot
  }
}

// Reads the next word from `fin` and maps it to its vocabulary index.
// Returns -1 when the end of the file was reached or the word is unknown.
int ReadWordIndex(FILE *fin) {
  char token[MAX_STRING];
  ReadWord(token, fin);
  return feof(fin) ? -1 : SearchVocab(token);
}

// Adds a word to the vocabulary.
//
// Appends a new entry (count 0) holding a private copy of `word`, grows the
// vocabulary array when it is nearly full, and registers the entry in the
// hash table. Words longer than MAX_STRING - 1 characters are stored
// truncated. The caller is expected to have checked that the word is not
// already present. Returns the index of the new entry; exits the process on
// allocation failure.
int AddWordToVocab(char *word) {
  unsigned int hash;
  size_t length = strlen(word) + 1;
  char *copy;
  if (length > MAX_STRING) length = MAX_STRING;
  copy = (char *)calloc(length, sizeof(char));
  if (copy == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }
  // Copy at most length - 1 characters; calloc already supplied the
  // terminator. (An unbounded strcpy would overflow the clamped buffer.)
  memcpy(copy, word, length - 1);
  vocab[vocab_size].word = copy;
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    struct vocab_word *grown;
    vocab_max_size += 1000;
    grown = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
    if (grown == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    vocab = grown;
  }
  // Hash the stored text so later lookups and re-hashing (SortVocab,
  // ReduceVocab) agree with this slot even if the word was truncated.
  hash = GetWordHash(copy);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
    return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[0], vocab_size, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occuring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash=GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
}

// Shrinks the vocabulary in place: entries with a count of at most min_reduce
// are freed, the survivors are compacted to the front of the array, the hash
// table is rebuilt, and min_reduce is raised by one for the next call.
void ReduceVocab() {
  int src, kept = 0;
  unsigned int slot;
  for (src = 0; src < vocab_size; src++) {
    if (vocab[src].cn <= min_reduce) {
      free(vocab[src].word);
      continue;
    }
    vocab[kept].cn = vocab[src].cn;
    vocab[kept].word = vocab[src].word;
    kept++;
  }
  vocab_size = kept;
  for (src = 0; src < vocab_hash_size; src++) vocab_hash[src] = -1;
  // Indices moved during compaction, so every surviving word is hashed again.
  for (src = 0; src < vocab_size; src++) {
    slot = GetWordHash(vocab[src].word);
    while (vocab_hash[slot] != -1) slot = (slot + 1) % vocab_hash_size;
    vocab_hash[slot] = src;
  }
  fflush(stdout);
  min_reduce++;
}

// Converts every training document from raw text to vocabulary indices.
//
// For each document, word_list is allocated with one slot per token counted
// while the file was read (doc.length on entry), content is split on spaces,
// and the index of every in-vocabulary word is appended to word_list. On
// return doc.length is the number of indices stored. Words are truncated to
// MAX_STRING - 2 characters in the same way as when the vocabulary was built.
// Exits the process on allocation failure.
void AddWordsInDoc() {
  long long a, b, c, i;
  char word[MAX_STRING];

  for (a = 0; a < train_docs; a++) {
    if ((debug_mode > 1) && (a % 100000 == 0)) {
      printf("%lldK%c docs", a / 1000, 13);
      fflush(stdout);
    }
    // Token count from the first pass: an upper bound on the words stored.
    long long capacity = doc_list[a].length;
    int stopped_early = 0;
    doc_list[a].word_list = (long long *)calloc(capacity, sizeof(long long));
    if (doc_list[a].word_list == NULL && capacity > 0) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    doc_list[a].length = 0;
    b = 0;
    c = 0;
    char* sentence = doc_list[a].content;
    while (sentence[b] != 0) {
      if (sentence[b] == ' ') {
        if (c > 0) {
          // A space ends the pending word; the space itself is consumed on
          // the next iteration (c is 0 by then).
          word[c] = 0;
          i = SearchVocab(word);
          if (i != -1 && doc_list[a].length < capacity) {
            doc_list[a].word_list[doc_list[a].length] = i;
            doc_list[a].length++;
          }
          if (doc_list[a].length >= MAX_SENTENCE_LENGTH-1) {
            printf("%lld\n", a);
            stopped_early = 1;
            break;
          }
          c = 0;
          continue;
        } else {
          b++;
          continue;
        }
      }
      word[c] = sentence[b];
      b++;
      c++;
      if (c >= MAX_STRING -1)
        c--;
    }
    // The text usually ends without a trailing space, so the last word is
    // still pending here; without this flush it would be silently dropped.
    if (!stopped_early && c > 0) {
      word[c] = 0;
      i = SearchVocab(word);
      if (i != -1 && doc_list[a].length < capacity) {
        doc_list[a].word_list[doc_list[a].length] = i;
        doc_list[a].length++;
      }
    }
  }
}

// Reads the training file, stores every line as a document and builds the
// vocabulary from it.
//
// Each non-empty line becomes one doc_list entry (raw text in content, token
// count in length). Lines are split on single spaces; every token, including
// the empty token produced by consecutive spaces, is counted in the
// vocabulary. Afterwards the vocabulary is sorted and filtered (SortVocab)
// and the documents are converted to index lists (AddWordsInDoc).
//
// Because the loop stops as soon as feof(fin) is set, a final line that is
// not terminated by a newline is not processed.
//
// Exits the process if the file cannot be opened, if it holds more than
// MAX_DOCUMENTS lines, or on allocation failure.
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  char sentence[MAX_SENTENCE_LENGTH+1];
  FILE *fin;
  long long a, b, c, i;

  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;

  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;

  while (1) {
    ReadSentence(sentence, fin);
    if (feof(fin)) break;
    // Check the limit before touching doc_list[train_docs]: the array has
    // exactly MAX_DOCUMENTS entries.
    if (train_docs >= MAX_DOCUMENTS) {
      printf("ERROR: number of documents exceeds!");
      exit(1);
    }
    size_t length = strlen(sentence) + 1;
    doc_list[train_docs].content = (char *)calloc(length + 1, sizeof(char));
    if (doc_list[train_docs].content == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    doc_list[train_docs].length = 0;
    strcpy(doc_list[train_docs].content, sentence);
    b = 0;
    c = 0;

    while (1) {
      if (sentence[b] == ' ' || sentence[b] == 0) {
        // End of a token (possibly empty): count it and update the vocabulary.
        word[c] = 0;
        train_words++;
        doc_list[train_docs].length++;
        if ((debug_mode > 1) && (train_words % 100000 == 0)) {
          printf("%lldK%c", train_words / 1000, 13);
          fflush(stdout);
        }
        i = SearchVocab(word);
        if (i == -1) {
          a = AddWordToVocab(word);
          vocab[a].cn = 1;
        } else {
          vocab[i].cn++;
        }
        c = 0;
        // Keep the hash table at most 70% full.
        if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
        if (sentence[b] == 0)
          break;
        b++;
        continue;
      }
      word[c] = sentence[b];
      b++;
      c++;
      if (c >= MAX_STRING-1) c--;  // Truncate too long words
    }

    train_docs++;
  }
  SortVocab();
  AddWordsInDoc();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
    printf("Documents size: %lld\n", train_docs);
  }

  file_size = ftell(fin);
  fclose(fin);
}

// Writes the vocabulary to save_vocab_file, one "<word> <count>" pair per
// line. Exits the process if the file cannot be opened for writing.
void SaveVocab() {
  long long i;
  FILE *fvoc = fopen(save_vocab_file, "wb");
  if (fvoc == NULL) {
    printf("ERROR: cannot open vocabulary file for writing!\n");
    exit(1);
  }
  for (i = 0; i < vocab_size; i++) fprintf(fvoc, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fvoc);
}

void ReadVocab() {
  long long a, i = 0;
  //char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

// Initialization of the network.
//
// Allocates syn0 and syn1neg (vocab_size * num_topics * layer1_size values
// each, 128-byte aligned) and fills both with small random values in
// [-0.5, 0.5) / layer1_size. Allocates p_z_d (train_docs * num_topics) and
// fills each document's row with random weights normalized to sum to ~1.
// The value for word a, topic b, component c lives at index
// a * num_topics * layer1_size + b * layer1_size + c.
// Exits the process on allocation failure.
void InitNet() {
  long long a, b, c;
  unsigned long long next_random = 1;
  if (posix_memalign((void **)&syn0, 128, (long long)vocab_size * num_topics * layer1_size * sizeof(real)) != 0) syn0 = NULL;
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
  if (posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * num_topics * layer1_size * sizeof(real)) != 0) syn1neg = NULL;
  if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
  for (a = 0; a < vocab_size; a++)
    for (b = 0; b < num_topics; b++)
      for (c = 0; c < layer1_size; c++) {
        next_random = next_random * (unsigned long long)25214903917 + 11;
        syn1neg[a * num_topics * layer1_size + b * layer1_size + c] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
      }

  for (a = 0; a < vocab_size; a++)
    for (b = 0; b < num_topics; b++)
      for (c = 0; c < layer1_size; c++) {
        next_random = next_random * (unsigned long long)25214903917 + 11;
        // The topic stride is layer1_size (as for syn1neg above). Striding by
        // num_topics made topic rows overlap and left the rest of each
        // word's block uninitialized.
        syn0[a * num_topics * layer1_size + b * layer1_size + c] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
      }

  // Init plsa arguments
  if (posix_memalign((void **)&p_z_d, 128, (long long)train_docs * num_topics * sizeof(real)) != 0) p_z_d = NULL;
  if (p_z_d == NULL && train_docs > 0) {printf("Memory allocation failed\n"); exit(1);}
  for (a = 0; a < train_docs; a++) {
    real sum = 0.0;
    for (b = 0; b < num_topics; b++) {
        next_random = next_random * (unsigned long long)25214903917 + 11;
        p_z_d[a * num_topics + b] = ((next_random & 0xFFFF) / (real)65536);
        sum += p_z_d[a * num_topics + b];
    }

    // The small constant guards against division by zero.
    sum += 0.00000001;
    for (b = 0; b < num_topics; b++)
      p_z_d[a * num_topics + b] /= sum;
  }
  if (debug_mode > 1)
    printf("Memory allocation complete\n");
}



// Training worker; one instance runs per thread.
//
// `id` is the thread index passed as a pointer value (see TrainModel). The
// thread starts at document train_docs / num_threads * id and makes out_iter
// passes over its share of the documents; every document is processed
// in_iter times in a row. The shared arrays syn0, syn1neg and p_z_d and the
// shared scalars alpha and word_count_actual are read and written without any
// locking visible in this function.
// NOTE(review): neu1 and neu1e are allocated here and never freed.
void *TrainModelThread(void *id) {
  printf("the %lld th thread start...", (long long)id);
  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, doc_count = 0, doc_index = 0, sen_word_hash[MAX_SENTENCE_LENGTH + 1];
  long long t; 
  long long sen_sample[MAX_SENTENCE_SAMPLE];
  long long l1, l2, c, k, kt, ktj, target, label, local_out_iter = out_iter, local_in_iter = in_iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  real w; 
  clock_t now;
  // latent_dis: weight of every (context topic, target topic) pair for the
  // current step; doc_topic: per-topic weight accumulated over the document.
  real latent_dis[num_topics * num_topics];
  real doc_topic[num_topics];
  // n[0] is the centre word, n[1..negative] are negative samples.
  long long n[negative + 1];
  real *neu1e = (real *)calloc(num_topics * layer1_size, sizeof(real));
  real *neu1 = (real *)calloc(num_topics * layer1_size, sizeof(real));

  doc_index = train_docs / (long long)num_threads * (long long) id;
	while (1) {
				// Every ~10000 words: publish progress and decay the learning rate
				// linearly, with a floor of 0.0001 * starting_alpha.
				if (word_count - last_word_count > 10000) {
				  word_count_actual += word_count - last_word_count;
				  last_word_count = word_count;
				  if ((debug_mode > 1)) {
					now=clock();
					printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
					 word_count_actual / (real)(out_iter * train_words + 1) * 100,
					 word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
					fflush(stdout);
				  }
				  alpha = starting_alpha * (1 - word_count_actual / (real)(out_iter * train_words + 1));
				  if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
				}

				// End of this thread's share (or of the document list): start the
				// next outer pass from the thread's first document, or stop.
				if (doc_count > (real)train_docs / num_threads || doc_index >= train_docs) {
				  local_out_iter--;
				  if (local_out_iter == 0)
					break;
				  doc_count = 0;
				  word_count = 0;
				  last_word_count = 0;
				  sentence_length = 0;
				  doc_index = train_docs / (long long) num_threads * (long long) id;
				  continue;
				}

				// Load the current document into sen_word_hash, randomly dropping
				// frequent words when sample > 0.
				if (sentence_length == 0) {
					b = 0;
					while (1) {
		
						if (b == doc_list[doc_index].length)
							break;
						word = doc_list[doc_index].word_list[b];
						b++;
						if (word == -1)
							continue;
						word_count++;
						if (sample > 0) {
							real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
							next_random = next_random * (unsigned long long)25214903917 + 11;
							if (ran < (next_random & 0xFFFF) / (real)65536) continue;
						}
						sen_word_hash[sentence_length] = word;
						sentence_length++;
						if (sentence_length > MAX_SENTENCE_LENGTH-1) {
							break;
						}
					}
					local_in_iter = in_iter;
				}
				doc_count++;
		
				// Inner iterations over the same document.
				while (1) {
					local_in_iter--;
					if (local_in_iter == -1)
						break;
					sentence_position = 0;
					int doc_word_count = 0;
					for (k = 0; k < num_topics; k++)
						doc_topic[k] = 0;
					while (sentence_position < sentence_length) {
						word = sen_word_hash[sentence_position];
						if (word == -1) {
							sentence_position++;
							continue;
						}
						for (c = 0; c < num_topics * layer1_size; c++) neu1e[c] = 0;
					
						for (c = 0; c < num_topics * layer1_size; c++) neu1[c] = 0;
						// Random shrink of the context window for this position.
						next_random = next_random * (unsigned long long)25214903917 + 11;
						b = next_random % window;

						for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
							c = sentence_position - window + a;
							if (c < 0) continue;
							if (c >= sentence_length) continue;
							last_word = sen_word_hash[c];
							if (last_word == -1) continue;
							l1 = last_word * num_topics * layer1_size;
							for (c = 0; c < num_topics * layer1_size; c++) neu1e[c] = 0;
							
							for (c = 0; c < num_topics * layer1_size; c++) neu1[c] = 0;

							doc_word_count++;

						
							// neu1 starts as the context word's vectors, one per topic.
							for (kt = 0; kt < num_topics; kt++) {
								for (c = 0; c < layer1_size; c++) {
									neu1[kt * layer1_size + c] = syn0[c + l1 + kt * layer1_size];
								}
							}

							
							// Pick other words of the document, each with probability
							// rp_sample, up to MAX_SENTENCE_SAMPLE of them.
							for (c = 0; c < MAX_SENTENCE_SAMPLE; c ++) { sen_sample[c] = -1; }
							int already_sampled = 0;
							for (t = 0; t < sentence_length; t ++) {
								next_random = next_random * (unsigned long long)25214903917 + 11;
								if ((next_random & 0xFFFF) / (real)65536 > rp_sample) continue;
								if (t == sentence_position) continue;
								last_word = sen_word_hash[t];
								if (last_word == -1) continue;
								sen_sample[already_sampled] = last_word;
								already_sampled ++;
								if (already_sampled >= MAX_SENTENCE_SAMPLE) break;
							}
							w = 1.0 / rp_sample / sentence_length;

								// Add the sampled words' vectors, scaled by w, to neu1.
								// NOTE(review): l1 is overwritten here, so the syn0 update
								// further down (syn0[c + l1] += neu1e[c]) addresses the last
								// sampled word rather than the context word whenever
								// already_sampled > 0 -- confirm this is intended.
								for (t = 0; t < already_sampled; t ++) {
                 
									l1 = sen_sample[t] * num_topics * layer1_size;
									for (kt = 0; kt < num_topics; kt++) { 
										
										for (c = 0; c < layer1_size; c++) {
											
											neu1[kt * layer1_size + c] += w * syn0[c + l1 + kt * layer1_size];
										}
									}

								}

							// Choose the targets: the centre word plus `negative` words
							// drawn from the unigram table (redrawn when equal to the
							// centre word).
							for (c = 0; c < negative + 1; c++) {
							  if (c == 0) {
								n[c] = word;
							  } else {
								  next_random = next_random * (unsigned long long)25214903917 + 11;
								  target = table[(next_random >> 16) % table_size];
								  if (target == 0) target = next_random % (vocab_size - 1) + 1;
								  if (target == word) {
									c--;
									continue;
								  } else {
									n[c] = target;
								  }
							  }
							  
							}

							// E-step
							// For every topic pair (kt, ktj): weight =
							// p_z_d[kt] * p_z_d[ktj] * exp(sum over targets of the log of
							// the table sigmoid of +/- the dot product), then normalize.
							real sum = 0;
							for (kt = 0; kt < num_topics; kt++) {
								for (ktj = 0; ktj < num_topics; ktj++) {
									real tv = 0;
									for (d = 0; d < negative + 1; d++) {
										f = 0;
										for (c = 0; c < layer1_size; c++) {
											
											f += neu1[kt * layer1_size + c] * syn1neg[c + n[d] * num_topics * layer1_size + ktj * layer1_size];
										}
										if (f <= -MAX_EXP)
											f = -MAX_EXP;
										if (f >= MAX_EXP)
											f = MAX_EXP;
										if (d == 0) {
											tv += log(expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]);
											
										}
										else {
											tv += log(expTable[(int)((f * (-1) + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]);
										}
									}
									latent_dis[kt * num_topics + ktj] = p_z_d[doc_index * num_topics + kt] * p_z_d[doc_index * num_topics + ktj] * exp(tv);
									
									sum += latent_dis[kt * num_topics + ktj];
								}
							}
						
							for (kt = 0; kt < num_topics; kt++) {
								for (ktj = 0; ktj < num_topics; ktj++) {
									latent_dis[kt * num_topics + ktj] /= (sum + 0.0000001);
									doc_topic[kt] += latent_dis[kt * num_topics + ktj];
									doc_topic[ktj] += latent_dis[kt * num_topics + ktj];
								}
							}
							
							// Gradient step on syn1neg, weighted by latent_dis; the
							// gradient for the input side is accumulated in neu1e.
							// NOTE(review): for d > 0 the target is drawn again from the
							// table instead of reusing n[d] from the E-step -- confirm.
							if (negative > 0)
								for (d = 0; d < negative + 1; d++) {
									target = n[d];
									if (d == 0) {
										label = 1;
									} else {
										next_random = next_random * (unsigned long long)25214903917 + 11;
										target = table[(next_random >> 16) % table_size];
										if (target == 0) target = next_random % (vocab_size - 1) + 1;
										if (target == word) continue;
										label = 0;
										label = 0;
									}
									l2 = target * num_topics * layer1_size;

									for (kt = 0; kt < num_topics; kt++) {
										for (ktj = 0; ktj < num_topics; ktj++) {
											f = 0;
											for (c = 0; c < layer1_size; c++) {
											   f += neu1[kt * layer1_size + c] * syn1neg[c + l2 + ktj * layer1_size];
											 
											}
											if (f >= MAX_EXP) {
											  g = (label - 1) * alpha * latent_dis[kt * num_topics + ktj];
											} else if (f <= -MAX_EXP) {
													g = (label - 0) * alpha * latent_dis[kt * num_topics + ktj];
											} else {
												g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * latent_dis[kt * num_topics + ktj];
											}
											for (c = 0; c < layer1_size; c++) {
												neu1e[kt * layer1_size + c] += g * syn1neg[c + ktj * layer1_size + l2];
											}
											for (c = 0; c < layer1_size; c++) {
											  syn1neg[c + ktj * layer1_size + l2] += g * neu1[kt * layer1_size + c];
											}
										}
									}
								}
							
							// Apply the accumulated gradient to syn0 at offset l1 (see the
							// review note above about which word l1 refers to here).
							for (c = 0; c < num_topics * layer1_size; c++){
								
                syn0[c + l1] += neu1e[c];
								
							}

							// Apply the gradient, scaled by w, to every sampled word.
							w = 1.0 / rp_sample / sentence_length;
							for (t = 0; t < already_sampled; t ++) {

								l1 = sen_sample[t] * num_topics * layer1_size;
								for (c = 0; c < num_topics * layer1_size; c ++)
									
                  syn0[c + l1] += neu1e[c] * w;
							}

						}
						sentence_position++;
					}

					// Replace the document's topic weights with the accumulated
					// weights divided by 2 * doc_word_count.
					for (k = 0; k < num_topics; k++) {
						doc_topic[k] /= (real) doc_word_count * 2 + 0.000001;
						p_z_d[doc_index * num_topics + k] = doc_topic[k];
					}
				}
				sentence_length = 0;
				doc_index++;
	}
	pthread_exit(NULL);
}

void TrainModel() {
  long a, b, c, k;
  FILE *fo;
  FILE *bin_fo;
  FILE *fo2;
  FILE *test_fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  LearnVocabFromTrainFile();
  if(save_vocab_file[0] != 0) SaveVocab();
  if (output_file[0] == 0) return;
  if (test_output_file[0] == 0) return;
  InitNet();
  if (negative > 0) InitUnigramTable();
  start = clock();
  printf("start create thread...\n");
  for (a = 0; a < num_threads; a++) {
    printf("start create %ld thread ...\n", a);
    pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  }

  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  char output_file_name[MAX_STRING];
  char output_file_name2[MAX_STRING];
  //char output_binary_file_name[MAX_STRING];
  sprintf(output_file_name, "%s_s%lld_n%d_w%d_b%d_oit%lld_init%lld_k%lld_count%d_sample%f_input.txt", output_file, layer1_size, negative, window, binary, out_iter, in_iter, num_topics, min_count, sample);
  sprintf(output_file_name2, "%s_s%lld_n%d_w%d_b%d_oit%lld_init%lld_k%lld_count%d_sample%f_output.txt", output_file, layer1_size, negative, window, binary, out_iter, in_iter, num_topics, min_count, sample);
  fo = fopen(output_file_name, "wb");
  fo2 = fopen(output_file_name2, "wb");

  for (a = 0; a < vocab_size; a++) {
    for (b = 0; b < (long) num_topics; b++) {
      fprintf(fo, "%s#%ld ", vocab[a].word, b);
      fprintf(fo2, "%s#%ld ", vocab[a].word, b);
              if (binary) {
          for (c = 0; c < layer1_size; c++)
            fwrite(&syn0[a * num_topics * layer1_size + b * layer1_size + c], sizeof(real), 1, fo);
        } else {
          for (c = 0; c < layer1_size; c++) {

                        fprintf(fo, "%lf ", syn0[a * num_topics * layer1_size + b * layer1_size + c]);
            fprintf(fo2, "%lf ", syn1neg[a * num_topics * layer1_size + b * layer1_size + c]);
          }
        }
        if (binary) {
          fprintf(bin_fo, "\n");
        }else{
          fprintf(fo, "\n");
          fprintf(fo2, "\n");
        }
    }
  }
  fclose(fo);
  fclose(fo2);
  long long t, sentence_length = 0, s , cr, i = -1;
   long long l1;
  unsigned long long next_random = (long long) num_threads;
  real w;
  real *neu1 = (real*)calloc(num_topics * layer1_size, sizeof(real));
  printf("writing sentence vector ...\n");
  FILE *fi = fopen(test_file, "rb");
  test_fo = fopen(test_output_file, "wb");
  char word[MAX_STRING];
  long long sen[MAX_SENTENCE_LENGTH + 1];
  char sentence[MAX_SENTENCE_LENGTH+1];
  while (1) {
    sentence_length = 0;
    ReadSentence(sentence, fi);
    if (feof(fi)) break;
    s = 0;
    cr = 0;
    while (1) {
      if (s > strlen(sentence)) break;
      if (sentence[s] == ' ' || sentence[s] == 0) {
        word[cr] = 0;
        i = SearchVocab(word);
        if (i == -1) {
          cr = 0;
          s++;
          continue;
        } else {

          for (c = 0; c < num_topics * layer1_size; c ++) { neu1[c] = 0; }
          if (sample > 0) {
      			real ran = (sqrt(vocab[i].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[i].cn;
      			next_random = next_random * (unsigned long long)25214903917 + 11;
      			if (ran < (next_random & 0xFFFF) / (real)65536) continue;
      		}
      		if (sentence_length < MAX_SENTENCE_LENGTH) {
              		sen[sentence_length] = i;
              		sentence_length++;
      		}

      	}

        cr = 0;
        if (sentence[s] == 0)
          break;
        s++;
        continue;
      }
      word[cr] = sentence[s];
      s++;
      cr++;
      if (cr >= MAX_STRING-1) cr--;
    }

    for (t = 0; t < sentence_length; t ++ ) {
      if (sen[t] == -1) continue;
      w = 1.0/sentence_length;
      l1 = sen[t] * num_topics * layer1_size;
      for (k = 0; k < num_topics; k++) {
        for (c = 0; c < layer1_size; c ++){
          neu1[k * layer1_size + c] += w * syn0[c + l1 + k*layer1_size];
        }
      }

    }

    for (b = 0; b < (long) num_topics; b++){
      for (c = 0; c < layer1_size; c++) {
        fprintf(test_fo, "%lf ", neu1[b * layer1_size + c]);
      }
    }
    fprintf(test_fo, "\n");

  }

    fclose(fi);
    fclose(test_fo);
}


// Finds the command-line option `str` in argv[1..argc-1].
// Returns the index of the first exact match, or -1 when the option is
// absent. If the match is the very last argument (so no value follows it),
// prints a message and exits with status 1.
int ArgPos(char *str, int argc, char **argv) {
  int pos = 1;
  while (pos < argc) {
    if (strcmp(str, argv[pos]) == 0) {
      if (pos == argc - 1) {
        printf("Argument missing for %s\n", str);
        exit(1);
      }
      return pos;
    }
    pos++;
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("PLSA WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-K <int>\n");
    printf("\t\tNumber of topics(default 10)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -binary 0 -iter 3\n\n");
    return 0;
  }
  test_output_file[0] = 0;
  output_file[0] = 0;
  save_vocab_file[0] = 0;
  read_vocab_file[0] = 0;
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-test", argc, argv)) > 0) strcpy(test_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-test-output", argc, argv)) > 0) strcpy(test_output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-sentence-sample", argc, argv)) > 0) rp_sample = atof(argv[i+1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-K", argc, argv)) > 0) num_topics = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-out_iter", argc, argv)) > 0) out_iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-in_iter", argc, argv)) > 0) in_iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  doc_list = (struct doc*)calloc(MAX_DOCUMENTS, sizeof(struct doc));
  for (i = 0; i <= EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
  }
  TrainModel();
  return 0;
}
