/*
 * The MIT License (MIT)

 * Copyright (c) 2025 GenText-Checker Developers

 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:

 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.

 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#ifndef GTCHECKER_SIMILARITY_H_
#define GTCHECKER_SIMILARITY_H_

#include "tokenizer.h"

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace gtchecker {

class Similarity {
 public:
  Similarity(const std::vector<gtchecker::Sentence>& chunks_A,
             const std::vector<gtchecker::Sentence>& chunks_B,
             const std::string& type) 
    : chunks_A_(chunks_A), 
      chunks_B_(chunks_B), 
      type_(type) { }

  std::vector<std::vector<double>> Calculate() {
    if (type_ == "--strict") {
      return strict_similarity();
    } else if (type_ == "--loose") {
      return loose_similarity();
    } else {
      std::cout << "Unknow Similarity type: " << type_ << std::endl;
      exit(EXIT_FAILURE);
    }
  }

 private:
  std::vector<std::vector<double>> strict_similarity() {
    std::vector<std::vector<double>> results;
    for (int i = 0; i < chunks_A_.size(); ++i) {
      std::vector<double> tmp;
      for (int j = 0; j < chunks_B_.size(); ++j) {
        double similarity;
        if (cosine_similarity(chunks_A_[i].sentence(), 
                              chunks_B_[j].sentence()) > 0.7) {
          similarity = 1.0;
        } else {
          similarity = 0.0;
        }
        tmp.push_back(similarity);
      }
      results.push_back(tmp);
    }
    return results;
  }

  std::vector<std::vector<double>> loose_similarity() {
    std::vector<std::vector<double>> results;
    for (int i = 0; i < chunks_A_.size(); ++i) {
      std::vector<double> tmp;
      for (int j = 0; j < chunks_B_.size(); ++j) {
        double similarity;
        if (chunks_A_[i].sentence() == chunks_B_[j].sentence()) {
          similarity = 1.0;
        } else {
          similarity = 0.0;
        }
        tmp.push_back(similarity);
      }
      results.push_back(tmp);
    }
    return results;
  }

  double cosine_similarity(const std::string& str_A, 
                           const std::string& str_B) {
    gtchecker::Tokenizer tokenizer;
    std::vector<std::string> A_words = tokenizer.GetSplitWords(str_A);
    std::vector<std::string> B_words = tokenizer.GetSplitWords(str_B);

    std::vector<int> vector_A;
    std::vector<int> vector_B;

    vectorize(A_words, B_words, &vector_A, &vector_B);

    double similarity = dot(vector_A, vector_B) / (norm(vector_A) * norm(vector_B));

    return similarity;
  }

  void vectorize(const std::vector<std::string>& document_A,
                 const std::vector<std::string>& document_B,
                 std::vector<int>* vector_A,
                 std::vector<int>* vector_B) {
    
    std::unordered_map<std::string, int> word_map;
    std::unordered_map<std::string, int> A_map;
    std::unordered_map<std::string, int> B_map;

    for (int i = 0; i < document_A.size(); ++i) {
      word_map[document_A[i]]++;
      A_map[document_A[i]]++;
    }
    for (int i = 0; i < document_B.size(); ++i) {
      word_map[document_B[i]]++;
      B_map[document_B[i]]++;
    }

    std::unordered_map<std::string, int>::iterator it = word_map.begin();
    for (; it != word_map.end(); ++it) {
      vector_A->push_back(A_map[it->first]);
      vector_B->push_back(B_map[it->first]);
    }
  }

  double dot(const std::vector<int>& vector_A,
             const std::vector<int>& vector_B) {
    if (vector_A.size() != vector_B.size()) {
      throw std::invalid_argument("Vectors must be of the same length.");
    }

    double result = 0.0;
    for (int i = 0; i < vector_A.size(); ++i) {
      result += vector_A[i] * vector_B[i];
    }

    return result;
  }

  double norm(std::vector<int> vector_A) {
    double accumulate = 0.0;
    for (int i = 0; i < vector_A.size(); ++i) {
      accumulate += vector_A[i] * vector_A[i];
    }
    return std::sqrt(accumulate);
  }

  std::vector<gtchecker::Sentence> chunks_A_;
  std::vector<gtchecker::Sentence> chunks_B_;
  std::string type_;
};

} // namespace gtchecker

#endif // GTCHECKER_SIMILARITY_H_