Spaces:

mahedi420
/

data_analysis

Build error

App Files Files Community

data_analysis / app.py

mahedi420

update

82f1106 verified over 2 years ago

raw

history blame contribute delete

6.91 kB

	import os
	import re
	import sys
	from contextlib import redirect_stdout
	from io import StringIO

	import google.generativeai as genai
	import matplotlib.pyplot as plt
	import pandas as pd
	import streamlit as st
	from dotenv import load_dotenv
	from pytrie import SortedStringTrie

	## load all the environment variables (including the Genai key)
	load_dotenv()

	# Replace sys.stdout with an in-memory buffer: from this point on, anything
	# written with print() is collected in output_stream instead of the console.
	output_stream = StringIO()
	sys.stdout = output_stream


	## Configure Genai Key
	# The key is read from the GOOGLE_API_KEY environment variable.
	genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

	# build auto complete model
	class TextSuggestionTool:
	    """Prefix-based autocomplete backed by a ``SortedStringTrie``.

	    Words are stored as trie keys (the stored value is always ``True``);
	    ``auto_words`` returns every stored word that starts with a prefix.
	    """

	    def __init__(self):
	        self.trie = SortedStringTrie()

	    def add_word(self, word):
	        """Store *word* as a suggestion candidate.

	        Re-adding an existing word simply overwrites the same key, so no
	        membership check is needed.
	        """
	        self.trie[word] = True

	    def build_from_corpus(self, corpus):
	        """Add every non-empty word of *corpus* to the trie.

	        Only complete words are stored.  The trie already answers prefix
	        queries, so storing each prefix of each word as its own key would
	        only fill the suggestions with fragments ("ap", "app", "appl", ...)
	        and make the trie grow quadratically with word length.
	        """
	        for word in corpus:
	            if word:
	                self.add_word(word)

	    def auto_words(self, prefix):
	        """Return the list of stored words that start with *prefix*."""
	        return list(self.trie.keys(prefix=prefix))

	## generate a response to the user query
	def get_gemini_response(question, prompt):
	    """Send *prompt* followed by *question* to the 'gemini-pro' model.

	    Returns the ``text`` attribute of the generated response.
	    """
	    gemini = genai.GenerativeModel('gemini-pro')
	    return gemini.generate_content([prompt, question]).text


	## Streamlit App
	df = pd.DataFrame()
	suggestion_tool = TextSuggestionTool()
	string_of_csv = []
	auto_complete_corpus = []
	# Defaults so the prompts built by the submit handlers do not hit a NameError
	# when a button is pressed before any file has been uploaded.
	column = ""
	columns_and_types = ""
	st.set_page_config(page_title="I can Retrieve Any SQL query")
	st.header("Data analysis")
	# upload data file
	uploaded_file = st.file_uploader("Upload data file", type=["csv"], key="uploaded_file")
	if uploaded_file is not None:
	    df = pd.read_csv(uploaded_file)
	    # drop columns and rows that are completely empty
	    df = df.dropna(axis=1, how="all")
	    df = df.dropna(axis=0, how="all")
	    # NOTE(review): filling with the string 'nan' turns numeric columns that
	    # contain missing values into object columns - confirm this is intended.
	    df = df.fillna('nan')

	    # lower-case every string cell so matching is case-insensitive
	    df = df.map(lambda x: x.lower() if isinstance(x, str) else x)
	    object_columns = df.select_dtypes(include='object')
	    lists_of_columns = [df[name].tolist() for name in object_columns.columns]
	    string_of_csv = [element for sublist in lists_of_columns for element in sublist]

	    # the autocomplete corpus is every value of every object column, as text
	    auto_complete_corpus.extend(str(element) for element in string_of_csv)

	    df.columns = df.columns.str.lower()
	    df.columns = df.columns.str.strip()
	    # all columns append in list
	    column_names = df.columns.tolist()
	    column = ', '.join(column_names)
	    # Joining the dict itself would only join its keys (the column names),
	    # so render each "name: dtype" pair explicitly.
	    columns_and_types = ', '.join(
	        f"{name}: {dtype}" for name, dtype in df.dtypes.items()
	    )
	    # build auto complete data structure
	    suggestion_tool.build_from_corpus(auto_complete_corpus)
	    # show the dataframe columns
	    st.text(column)
	# dashboard split into two parts
	col1, col2 = st.columns([1,1])
	with col1:
	    st.subheader("User Input for data extraction")
	    question1 = st.text_input("Input: ", key="input_extraction")
	    # generate suggested words, last in sort order first
	    autocomplete1 = suggestion_tool.auto_words(question1.lower())
	    autocomplete1.reverse()
	    # show at most five suggested words
	    st.subheader("Suggessions")
	    if autocomplete1:
	        for word in autocomplete1[:5]:
	            st.write(word)
	    else:
	        st.write("No suggestions found.")
	    # submit button 1
	    submit1 = st.button("Data Extract")
	with col2:
	    st.subheader("User Input for data visualization")
	    question2 = st.text_input("Input: ", key="input_visualization")
	    # generate suggested words, last in sort order first
	    autocomplete2 = suggestion_tool.auto_words(question2.lower())
	    autocomplete2.reverse()
	    # show at most five suggested words
	    st.subheader("Suggessions")
	    if autocomplete2:
	        for word in autocomplete2[:5]:
	            st.write(word)
	    else:
	        st.write("No suggestions found.")
	    # submit button 2
	    submit2 = st.button("Data Data Visualization")

	if submit1:
	    # Prompt asking the model to turn the question into printable pandas code.
	    prompt1 = f"""
	You are an expert in generating pandas code from a pandas dataframe to user question.
	now your task is to generate pandas code from given dataframe based on english questions without any variable name.
	The given pandas DataFrame name is df and the columns are {column},
	now generate pandas code from the df dataframe based on englsih question
	please add print all the time

	For example :
	For example 1 :
	english query : how many rows ?
	python code : print(len(df.index))

	For example 2 :
	english query : how many columns ?
	python code : print(len(df.columns))
	"""
	    # get response from model
	    response = get_gemini_response(question1, prompt1)
	    # Keep only the body of a Markdown code fence if the model used one.
	    # Blindly deleting every "python" substring would also corrupt string
	    # literals inside the generated code (e.g. df['lang'] == 'python').
	    fenced = re.search(r"```(?:python|py)?\s*(.*?)```", response, re.DOTALL | re.IGNORECASE)
	    code = fenced.group(1) if fenced else response.strip().strip('`')
	    st.subheader("Statiscal analysis of data")
	    # Capture only what this run prints, in a fresh buffer; redirect_stdout
	    # restores the previous sys.stdout even when the code raises.
	    captured = StringIO()
	    try:
	        # SECURITY NOTE: this executes model-generated code with full access
	        # to the process. Only run this app in a trusted/sandboxed setting.
	        with redirect_stdout(captured):
	            exec(code, globals())
	    except Exception as e:
	        st.write("Error:", e)
	    else:
	        st.code(captured.getvalue())

	if submit2:
	    # Prompt asking the model to turn the question into matplotlib chart code.
	    prompt2 = f"""
	You are an expert in generating python code for statistical chart like bar, pie, histogram, scatter from a pandas dataframe.
	now your task is to generate python code for statistical chart from given dataframe based on english questions.
	The given pandas DataFrame name is df and the columns are {column} and columns data types are {columns_and_types}
	now generate Python code from the df dataframe based on englsih question

	For example :
	For example 1 :
	english query : generate code for pie chart of gender distribution
	python code : gender_counts = df['gender'].value_counts()

	# Plotting a Pie Chart
	plt.figure(figsize=(8, 8))
	plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90, colors=['skyblue', 'lightcoral'])
	plt.title('Distribution of Gender Among Students')
	plt.show()

	For example 2 :
	english query : generate python code for histogram of age based on name
	python code :
	# Plotting the histogram
	plt.hist(df['Age'], bins=10, edgecolor='black')
	plt.xlabel('Age')
	plt.ylabel('Frequency')
	plt.title('Histogram of Age based on Names')
	plt.grid(True)
	plt.show()
	"""
	    # get response from model
	    response = get_gemini_response(question2, prompt2)
	    # Keep only the body of a Markdown code fence if the model used one.
	    # Blindly deleting every "python" substring would also corrupt string
	    # literals inside the generated code.
	    fenced = re.search(r"```(?:python|py)?\s*(.*?)```", response, re.DOTALL | re.IGNORECASE)
	    code = fenced.group(1) if fenced else response.strip().strip('`')
	    st.subheader("Statiscal analysis of data")

	    # show the plot on the user side
	    try:
	        # SECURITY NOTE: this executes model-generated code with full access
	        # to the process. Only run this app in a trusted/sandboxed setting.
	        exec(code, globals())
	        # exec() always returns None, so fetch the figure the generated code
	        # drew on and hand it to Streamlit explicitly instead of relying on
	        # the deprecated implicit global-figure mode of st.pyplot().
	        fig = plt.gcf()
	        st.pyplot(fig)
	        # release the figure so reruns do not accumulate open figures
	        plt.close(fig)
	    except Exception as e:
	        # Report in the page; a print() here would not be visible to the user.
	        st.write("Error:", e)