Spaces:
Build error
Build error
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline | |
| from sentence_transformers import SentenceTransformer, util | |
| import random | |
| import re | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| import warnings | |
| from transformers import logging | |
| import os | |
| import tensorflow as tf | |
| import requests | |
# --- Module-level runtime configuration -------------------------------------

# Quieten TensorFlow's native logging and turn off the oneDNN custom ops.
# NOTE(review): these are assigned after `import tensorflow` has already run
# at the top of the file; presumably they must be set before that import to
# affect TensorFlow's start-up output -- confirm and reorder if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Suppress every Python warning. The catch-all filter already covers the
# FutureWarning / UserWarning categories, so no per-category filters are needed.
warnings.filterwarnings("ignore")

# `logging` here is `transformers.logging` (imported above), not the stdlib module.
logging.set_verbosity_error()
tf.get_logger().setLevel('ERROR')

# Sentence-tokenizer data used by `sent_tokenize`.
# NOTE(review): recent NLTK releases appear to load "punkt_tab" instead of the
# pickled "punkt" model, so both are requested -- confirm against the NLTK
# version pinned for this app.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# The Groq API key must come from the environment (for example a Space secret),
# never from source control. A key that was previously committed in this file
# should be treated as compromised and revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
class TextEnhancer:
    """Paraphrase, grammar-correct and lightly restyle text sentence by sentence.

    On construction three models are loaded: a T5 paraphraser, a
    text2text grammar-correction pipeline and a sentence-embedding model
    used to compare each paraphrase with its source sentence. The first and
    last sentences are additionally sent to the Groq chat-completions API for
    a rewrite.
    """

    # Endpoint and model used by `_evaluate_with_groq`.
    _GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
    _GROQ_MODEL = "llama3-70b-8192"
    # Upper bound (seconds) on the Groq HTTP call so a stalled connection
    # cannot block a request forever.
    _GROQ_TIMEOUT_SECONDS = 60

    # Whole-word substitutions applied at random by `_humanize_text`.
    _CONTRACTIONS = {
        "can't": "cannot",
        "won't": "will not",
        "I'm": "I am",
        "it's": "it is",
    }

    # Characters that may legitimately follow a sentence's final punctuation.
    _CLOSERS = "\"')]}\u201d\u2019"

    def __init__(self):
        """Load the paraphrase, grammar and similarity models onto the best device."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(self.device)

        paraphraser_name = "prithivida/parrot_paraphraser_on_T5"
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained(paraphraser_name)
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
            paraphraser_name
        ).to(self.device)
        print("paraphraser loaded")

        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            # The pipeline takes a device index: 0 for the first GPU, -1 for CPU.
            device=0 if self.device == "cuda" else -1,
        )
        print("grammar check loaded")

        self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
        print("sementics model loaded")

    def _evaluate_with_groq(self, passage=""):
        """Ask the Groq chat API to rewrite ``passage`` as an SoP opening sentence.

        Args:
            passage: The sentence to rewrite; must be non-empty.

        Returns:
            The text of the first completion choice, or ``""`` when the reply
            carries no content.

        Raises:
            ValueError: If ``passage`` is empty, the API answers with a
                non-200 status, or the reply does not have the expected shape.
        """
        if not passage:
            raise ValueError("Input passage cannot be empty.")

        headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self._GROQ_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": "Paraphrase this sentence to better suit it as an introductory sentence to a student's Statement of purpose. Ensure that the vocabulary and grammar is upto par. ONLY return the raw paraphrased sentence and nothing else.IF IT IS a empty string, return empty string ",
                },
                {
                    "role": "user",
                    "content": f"Here is the passage: {passage}",
                },
            ],
            "temperature": 1.0,
            "max_tokens": 8192,
        }

        print("Sending request to Groq API...")
        response = requests.post(
            self._GROQ_URL,
            json=payload,
            headers=headers,
            timeout=self._GROQ_TIMEOUT_SECONDS,
        )
        print("Response received.")

        if response.status_code != 200:
            raise ValueError(f"Groq API error: {response.status_code}, {response.text}")

        data = response.json()
        try:
            first_choice = data.get("choices", [{}])[0]
            # `message` / `content` may be present but null; treat that as empty.
            message = first_choice.get("message") or {}
            segmented_text = message.get("content") or ""
        except (IndexError, KeyError, AttributeError, TypeError) as e:
            raise ValueError(
                f"Unexpected response structure from Groq API. Error: {str(e)}"
            ) from e

        print("sentence paraphrase processed successfully.")
        print(segmented_text)
        return segmented_text

    def _correct_formatting(self, sentence):
        """Collapse runs of a repeated ``.``, ``,``, ``!`` or ``?`` and trim whitespace."""
        return re.sub(r'([.,!?])\1+', r'\1', sentence).strip()

    @staticmethod
    def _join_sentences(sentences):
        """Join sentences with single spaces, ensuring each ends in ``.``, ``!`` or ``?``.

        Blank entries are dropped. A full stop is appended only when the
        sentence (ignoring trailing quotes/brackets) has no terminal
        punctuation, so ``"Why?"`` is not turned into ``"Why?."``.
        """
        parts = []
        for raw in sentences:
            piece = raw.strip()
            if not piece:
                continue
            core = piece.rstrip(TextEnhancer._CLOSERS)
            if not core or core[-1] not in ".!?":
                piece += "."
            parts.append(piece)
        return " ".join(parts)

    def enhance_text(self, text, min_similarity=0.8, max_variations=3):
        """Rewrite ``text`` one sentence at a time.

        For every sentence, ``max_variations`` beam-search paraphrases are
        generated and those whose cosine similarity to the original is at
        least ``min_similarity`` are kept. The first kept paraphrase is
        grammar-corrected, restyled and cleaned; the first and last sentences
        are also passed through the Groq rewrite beforehand. A sentence with
        no acceptable paraphrase is kept as-is (after formatting clean-up).

        Args:
            text: Input passage.
            min_similarity: Minimum cosine similarity for a paraphrase to be used.
            max_variations: Number of beams and of returned paraphrases.

        Returns:
            The rewritten passage, or ``""`` when ``text`` is empty or blank.

        Raises:
            ValueError: Propagated from `_evaluate_with_groq`.
        """
        if not text or not text.strip():
            return ""

        # Drop blank fragments up front so "last sentence" below really is the
        # last sentence that gets processed.
        sentences = [s for s in sent_tokenize(text) if s.strip()]
        total_words = sum(len(s.split()) for s in sentences)
        print(f"generated: {total_words}")

        last_position = len(sentences)
        enhanced_sentences = []
        for position, sentence in enumerate(sentences, start=1):
            inputs = self.paraphrase_tokenizer(
                f"paraphrase: {sentence}",
                return_tensors="pt",
                padding=True,
                max_length=150,
                truncation=True,
            ).to(self.device)
            # NOTE(review): `temperature` presumably has no effect here because
            # sampling is not enabled (plain beam search) -- confirm.
            outputs = self.paraphrase_model.generate(
                **inputs,
                max_length=len(sentence.split()) + 20,
                num_return_sequences=max_variations,
                num_beams=max_variations,
                temperature=0.7,
            )
            paraphrases = [
                self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
                for output in outputs
            ]

            sentence_embedding = self.similarity_model.encode(sentence)
            paraphrase_embeddings = self.similarity_model.encode(paraphrases)
            similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
            valid_paraphrases = [
                para
                for para, sim in zip(paraphrases, similarities[0].tolist())
                if sim >= min_similarity
            ]

            if not valid_paraphrases:
                enhanced_sentences.append(self._correct_formatting(sentence))
                continue

            best = valid_paraphrases[0]
            if position in (1, last_position):
                # Opening and closing sentences get an extra LLM rewrite; an
                # empty reply leaves the paraphrase untouched.
                polished = self._evaluate_with_groq(best).strip()
                if polished:
                    best = polished

            corrected = self.grammar_pipeline(
                best,
                max_length=150,
                num_return_sequences=1,
            )[0]["generated_text"]
            corrected = self._humanize_text(corrected)
            enhanced_sentences.append(self._correct_formatting(corrected))

        return self._join_sentences(enhanced_sentences)

    def _humanize_text(self, text):
        """Apply small random surface edits to ``text``.

        Each whitespace-separated word has a 10% chance of being replaced by
        its entry in ``_CONTRACTIONS`` (if any). With 30% probability a comma
        is inserted before " and " (skipped where one is already there), and
        with 50% probability " is " becomes " happens to be ". Whitespace is
        normalized to single spaces as a side effect of the split/join.
        """
        words = text.split()
        text = " ".join(
            self._CONTRACTIONS.get(word, word) if random.random() > 0.9 else word
            for word in words
        )
        if random.random() > 0.7:
            # The lookbehind avoids producing ",, and" when a comma already precedes.
            text = re.sub(r"(?<!,) and ", ", and ", text)
        # Minor variations in sentence structure
        if random.random() > 0.5:
            text = text.replace(" is ", " happens to be ")
        return text
def normalize_similarity_threshold(value):
    """Return ``value`` as a similarity fraction clamped to ``[0.0, 1.0]``.

    Values greater than 1 are read as percentages (``75`` -> ``0.75``);
    values of 1 or below are taken as fractions already. This prevents a
    fractional threshold such as ``0.75`` from being divided by 100 and
    silently disabling the similarity filter.
    """
    threshold = float(value)
    if threshold > 1:
        threshold /= 100
    return min(max(threshold, 0.0), 1.0)


def create_interface():
    """Build the Gradio UI around a single shared `TextEnhancer`.

    Returns:
        The `gr.Blocks` app: an input textbox, a "Paraphrase" button and an
        output textbox wired to the enhancer.
    """
    # Created once so the models are loaded a single time, not per request.
    enhancer = TextEnhancer()

    def process_text(text, similarity_threshold=0.75):
        """Run the enhancer on ``text``; return the result or an ``Error: ...`` string."""
        if not text or not text.strip():
            # Nothing to paraphrase; skip the models entirely.
            return ""
        try:
            enhanced = enhancer.enhance_text(
                text,
                min_similarity=normalize_similarity_threshold(similarity_threshold),
                max_variations=10,
            )
            print("grammar enhanced")
            return enhanced
        except Exception as e:
            # UI boundary: report the failure in the output box rather than crash.
            return f"Error: {str(e)}"

    with gr.Blocks() as interface:
        with gr.Row(elem_id="header", variant="panel"):
            gr.HTML("""
<div style="display: flex; align-items: center; justify-content: center; gap: 10px; margin-bottom: 20px;">
<img src="https://raw.githubusercontent.com/juicjaane/blueai/main/logo_2.jpg" style="width: 50px; height: 50px;">
<h1 style="color: gold; font-size: 2em; margin: 0;">Konect U</h1>
</div>
""")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Your SoP")
                input_text = gr.Textbox(label="Input", placeholder="Enter SoP to Paraphrase...", lines=10)
                submit_button = gr.Button("Paraphrase")
            with gr.Column(scale=1):
                gr.Markdown("### Paraphrased SoP")
                enhanced_text = gr.Textbox(label="SoP", lines=10)
        # Only the text is passed, so `similarity_threshold` uses its default.
        submit_button.click(process_text, inputs=[input_text], outputs=enhanced_text)
    return interface
# Script entry point: build the UI and serve it, requesting a public share link.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=True)