import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk


# word_tokenize() relies on NLTK's Punkt tokenizer data. Older NLTK releases
# load it from the "punkt" package, while newer releases (3.9 and later) look
# for "punkt_tab" instead and raise LookupError if only "punkt" is installed.
# Fetch both so the script works regardless of the installed NLTK version.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)


# Small in-memory sample of legal-domain sentences used as training text.
legal_corpus = [
    "The plaintiff filed a motion in court",
    "The defendant was found guilty of negligence",
    "The contract was deemed void due to misrepresentation",
    "The lawyer presented evidence before the judge",
    "The court issued a legal notice to the defendant",
]

# Lowercase every sentence first, then split it into tokens; the result is
# one token list per sentence, in the same order as legal_corpus.
tokenized_corpus = list(map(word_tokenize, map(str.lower, legal_corpus)))


# Training hyperparameters, kept together so they are easy to tweak.
# min_count=1 matters here: the corpus is tiny and most words occur once.
# sg=1 selects skip-gram training (see the gensim Word2Vec documentation).
training_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 1,
    "sg": 1,
    "epochs": 50,
}

# Passing the sentences to the constructor trains the model immediately.
model = Word2Vec(sentences=tokenized_corpus, **training_params)

# Write the trained model to the current working directory, then report it.
model.save("legal_word2vec.model")
print("Model Trained Successfully")


# For each probe word, list the vocabulary entries the model ranks as most
# similar, together with their scores formatted to four decimals.
for probe in ("plaintiff", "contract"):
    print(f"\nWords similar to '{probe}':")
    for term, closeness in model.wv.most_similar(probe):
        print(f"{term}: {closeness:.4f}")

# Report the similarity score for one hand-picked pair of terms.
first_term, second_term = "plaintiff", "defendant"
similarity = model.wv.similarity(first_term, second_term)
print(
    f"\nSimilarity between '{first_term}' and '{second_term}': "
    f"{similarity:.4f}"
)