data_analysis / app.py
mahedi420's picture
update
82f1106 verified
import os
import pandas as pd
import google.generativeai as genai
## Configure Genai Key
from dotenv import load_dotenv
## load all the environemnt variables
load_dotenv()
import matplotlib.pyplot as plt
import streamlit as st
from pytrie import SortedStringTrie
import sys
from io import StringIO
output_stream = StringIO()
sys.stdout = output_stream
## Configure Genai Key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# build auto complete model
class TextSuggestionTool:
def __init__(self):
self.trie = SortedStringTrie()
def add_word(self, word):
if word not in self.trie:
self.trie[word] = True
def build_from_corpus(self, corpus):
for word in corpus:
sub_word = ""
for letter in word:
sub_word += letter
self.add_word(sub_word)
def auto_words(self, prefix):
suggestions = []
for key, value in self.trie.items(prefix=prefix):
suggestions.append(key)
return suggestions
## gernerate response of the user query
def get_gemini_response(question,prompt):
model=genai.GenerativeModel('gemini-pro')
response=model.generate_content([prompt,question])
return response.text
## Streamlit App
df= pd.DataFrame()
suggestion_tool = TextSuggestionTool()
string_of_csv = []
auto_complete_corpus = []
st.set_page_config(page_title="I can Retrieve Any SQL query")
st.header("Data analysis")
# upload data file
uploaded_file = st.file_uploader("Upload data file", type=["csv"], key="uploaded_file")
if uploaded_file is not None:
df = pd.read_csv(uploaded_file)
df = df.dropna(axis=1, how="all")
df = df.dropna(axis=0, how="all")
df = df.fillna('nan')
df = df.map(lambda x: x.lower() if isinstance(x, str) else x)
object_columns = df.select_dtypes(include='object')
lists_of_columns = [df[column].tolist() for column in object_columns.columns]
string_of_csv = [element for sublist in lists_of_columns for element in sublist]
for element in string_of_csv:
auto_complete_corpus.append(str(element))
df.columns = df.columns.str.lower()
df.columns = df.columns.str.strip()
# all columns append in list
column_names = df.columns.tolist()
column = ', '.join(column_names)
columns_and_types = dict(zip(df.columns, df.dtypes))
columns_and_types = ', '.join(columns_and_types)
# buid auto complete data structure
suggestion_tool.build_from_corpus(auto_complete_corpus)
# show the dataframe columns
st.text(column)
#dashboard split into two part
col1, col2 = st.columns([1,1])
with col1:
st.subheader("User Input for data extraction")
question1=st.text_input("Input: ",key="input_extraction")
# generated suggest word
autocomplete1 = suggestion_tool.auto_words(question1.lower())
autocomplete1.reverse()
# show suggested word
st.subheader("Suggessions")
if autocomplete1:
for word in autocomplete1[:5]:
st.write(word)
else:
st.write("No suggestions found.")
# submit button 1
submit1=st.button("Data Extract")
with col2:
st.subheader("User Input for data visualization")
question2=st.text_input("Input: ",key="input_visualization")
# generated suggest word
autocomplete2 = suggestion_tool.auto_words(question2.lower())
# show suggested word
st.subheader("Suggessions")
autocomplete2.reverse()
if autocomplete2:
for word in autocomplete2[:5]:
st.write(word)
else:
st.write("No suggestions found.")
#submit button 2
submit2=st.button("Data Data Visualization")
if submit1:
prompt1 = f"""
You are an expert in generating pandas code from a pandas dataframe to user question.
now your task is to generate pandas code from given dataframe based on english questions without any variable name.
The given pandas DataFrame name is df and the columns are {column},
now generate pandas code from the df dataframe based on englsih question
please add print all the time
For example :
For example 1 :
english query : how many rows ?
python code : print(len(df.index))
For example 2 :
english query : how many columns ?
python code : print(len(df.columns))
"""
# get response from model
response=get_gemini_response(question1,prompt1)
response = response.replace('`','')
response = response.replace('python','')
st.subheader("Statiscal analysis of data")
# st.text(response)
st.set_option('deprecation.showPyplotGlobalUse', False)
# show the plot on the user side
try:
exec(response, globals(), locals())
result = output_stream.getvalue()
st.code(result)
except Exception as e:
st.write("Error:", e)
finally:
# Reset sys.stdout to its original value
sys.stdout = sys.__stdout__
if submit2:
prompt2 = f"""
You are an expert in generating python code for statistical chart like bar, pie, histogram, scatter from a pandas dataframe.
now your task is to generate python code for statistical chart from given dataframe based on english questions.
The given pandas DataFrame name is df and the columns are {column} and columns data types are {columns_and_types}
now generate Python code from the df dataframe based on englsih question
For example :
For example 1 :
english query : generate code for pie chart of gender distribution
python code : gender_counts = df['gender'].value_counts()
# Plotting a Pie Chart
plt.figure(figsize=(8, 8))
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90, colors=['skyblue', 'lightcoral'])
plt.title('Distribution of Gender Among Students')
plt.show()
For example 2 :
english query : generate python code for histogram of age based on name
python code :
# Plotting the histogram
plt.hist(df['Age'], bins=10, edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Histogram of Age based on Names')
plt.grid(True)
plt.show()
"""
# get response from model
response=get_gemini_response(question2,prompt2)
response = response.replace('`','')
response = response.replace('python','')
print(response)
st.subheader("Statiscal analysis of data")
# st.text(response)
st.set_option('deprecation.showPyplotGlobalUse', False)
# show the plot on the user side
try:
fig = exec(response)
st.pyplot(fig)
except Exception as e:
print(f"Error executing code: {e}")