Commit 769af1a · Sunil Surendra Singh committed · 0 Parent(s)

First commit

Browse files
- .gitattributes +5 -0
- .github/workflows/main.yml +19 -0
- .gitignore +139 -0
- .streamlit/config.toml +10 -0
- LICENSE +21 -0
- README.md +92 -0
- artifacts/about.md +75 -0
- artifacts/lr_model.joblib +3 -0
- assets/NLP.png +3 -0
- assets/emotion.png +3 -0
- assets/logo.png +3 -0
- assets/ner.png +3 -0
- assets/pos.png +3 -0
- assets/summ1.png +3 -0
- assets/summ2.png +3 -0
- assets/title.png +3 -0
- data/emotions.csv +0 -0
- notebooks/experiments.ipynb +481 -0
- requirements.txt +9 -0
- src/app.py +39 -0
- src/config.py +94 -0
- src/data.py +55 -0
- src/emotion_classification.py +45 -0
- src/plot.py +37 -0
- src/tab_about.py +22 -0
- src/tab_emotions.py +57 -0
- src/tab_ner.py +64 -0
- src/tab_summarization.py +83 -0
- src/utils.py +61 -0
.gitattributes
ADDED
@@ -0,0 +1,5 @@
*.parquet filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
.github/workflows/main.yml
ADDED
@@ -0,0 +1,19 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]
  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push --force https://sssingh:[email protected]/spaces/sssingh/nlp-ner-summarization-classification main
.gitignore
ADDED
@@ -0,0 +1,139 @@
# MY CHANGES
scratch*.ipynb
venv*/
.vscode*
.vscode*/
.examples/

.swp

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py


# Environments
.env
.venv
env/
venv*/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
.streamlit/config.toml
ADDED
@@ -0,0 +1,10 @@
[theme]
primaryColor="#bc50ee"
backgroundColor="#333333"
secondaryBackgroundColor="#2a2a2b"
textColor="#fbfafa"
font = "sans serif"

[server]
runOnSave = false
headless = true
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Sunil S. Singh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,92 @@
---
title: Nlp - NER POS Text-Summarization Text-Classification
emoji: 🧾
colorFrom: red
colorTo: pink
sdk: streamlit
sdk_version: 1.27.1
app_file: app.py
pinned: false
license: mit
---

<a href="https://huggingface.co/spaces/sssingh/nlp-ner-summarization-classification" target="_blank"><img src="https://img.shields.io/badge/click_here_to_open_streamlit_app-f63366?style=for-the-badge&logo=streamlit&logoColor=black" /></a>


<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/title.png?raw=true" width="1000" height="350"/><br><br>

# Common NLP Tasks
***This app demonstrates common NLP techniques and use cases.***
>This app demonstrates typical NLP techniques used in real-world use cases, such as NER & POS recognition, text summarization, and text classification.

* **`NER`** has applications across industries. For example, in `finance` it can extract essential information from earnings reports, financial statements, news articles, and product mentions for automated analysis, fraud detection, and spotting investment opportunities. In `media and entertainment`, it analyzes text for content creation and personalization. In `e-commerce`, it extracts product information from reviews, customer feedback, and descriptions, enabling automated analysis and personalized recommendations.

* **`POS`** tagging is an essential NLP technique used in machine translation, word-sense disambiguation, question answering, parsing, and so on.

* **`Text Summarization`** has a plethora of real-world use cases, for example: `media monitoring` for sensitive and objectionable content, `helping disabled people` by presenting only short and relevant content, producing succinct summaries of `meetings and video conferences`, and summarizing financial documents such as earnings reports and financial news to quickly derive market signals.

* **`Text Classification`** has a multitude of applications, such as `categorizing customer support tickets` (billing, feedback, questions, complaints, etc.), `sentiment analysis` (customer feedback, tweets, etc.), and `content moderation` (hate speech, obscene language, NSFW content, etc.).


# App UI Details
The app has four tabs: "ABOUT", "NER & POS", "TEXT SUMMARIZATION", and "TEXT CLASSIFICATION".

## ABOUT Tab
This tab displays this overview page.

## NER & POS Tab
Given a text fragment, named entities (NER) and parts of speech (POS) in the text can be extracted with the click of a button:

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/ner.png?raw=true"/><br>

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/pos.png?raw=true"/>


Because of hardware resource constraints on public cloud hosting, the app uses a "small" language model, so the illustrated functionality is far from ideal. A bigger model running on more capable hardware will yield much better results.
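For a concrete sense of what this tab does under the hood, here is a minimal spaCy sketch taken from `notebooks/experiments.ipynb` in this commit (the app's actual UI code lives in `src/tab_ner.py`):

```python
# Minimal sketch of the extraction behind this tab, assuming the
# en_core_web_sm model from requirements.txt is installed.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is an example text in Singapore by Sunil Singh on 6th August 2023")

# Named entities, e.g. Singapore -> GPE, 6th August 2023 -> DATE
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))

# Part-of-speech tag for every token
for token in doc:
    print(token.text, token.pos_, spacy.explain(token.pos_))
```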
## TEXT SUMMARIZATION Tab
A brief summary is generated from the given text. The summarization technique (TextRankSummarizer, LexRankSummarizer, or LsaSummarizer) and the length of the summary may be selected by the user.

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ1.png?raw=true"/>

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ2.png?raw=true"/>
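The app relies on the `sumy` library for this; the following is a rough sketch of how such a summary can be produced with the TextRank option, not necessarily the app's exact implementation (the tokenizer needs NLTK's `punkt` data, which the app downloads at startup):

```python
# Rough sumy sketch of the TextRank summarization path; sentences_count is
# the summary length the UI lets the user choose.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

text = "..."  # the long input text pasted into the app
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer()
for sentence in summarizer(parser.document, sentences_count=2):
    print(sentence)
```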
## TEXT CLASSIFICATION Tab
The text classifier determines the `emotion` portrayed by a given sentence or paragraph. This app uses a `LogisticRegression` classifier to detect emotions, trained on 34,000 labeled samples.

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/emotion.png?raw=true"/>

Please keep in mind that this is far from flawless. Given the training and inference hardware restrictions, the corpus used for training is tiny and the model is basic. Training on a much bigger text corpus and employing a model capable of classifying non-linear data (e.g., XGBoost, RandomForest, or a neural network) would yield significantly better results.
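A minimal sketch of querying the saved classifier, based on the commented test code in `src/emotion_classification.py` (the saved object is a scikit-learn pipeline of `CountVectorizer` + `LogisticRegression`):

```python
# Load the trained pipeline from the artifacts folder and classify a sentence.
import joblib

model = joblib.load("artifacts/lr_model.joblib")
text = "I am loving NLP and it makes me feel so good"
print(model.predict([text]))        # predicted emotion label
print(model.classes_)               # the emotion labels the model knows
print(model.predict_proba([text]))  # per-class probabilities (plotted in the app)
```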
# Project Source
[👉 Visit GitHub Repo](https://github.com/sssingh/nlp_ner_summarization_classification)

# Contact Me
[](mailto:[email protected])
[](https://twitter.com/@thesssingh)
[](https://www.linkedin.com/in/sssingh/)
[](https://sunilssingh.me)

# Appendix

## Local Installation and Run
To run the app locally:
1. Create a conda or virtual environment and activate it
2. Install Python 3.11.0 or above
3. Execute the commands below from a terminal/command prompt
```
git clone https://github.com/sssingh/nlp_ner_summarization_classification
cd nlp_ner_summarization_classification
pip install -r requirements.txt
streamlit run src/app.py
```
4. Open any browser and visit `localhost:8501`

NOTE: The trained text classifier is kept in the `artifacts` folder as the `lr_model.joblib` file. If you wish to re-train the model and change its hyperparameters (or use another classifier), then:
* Modify the `src/emotion_classification.py` script
* Execute the commands below from a terminal/command prompt
```
pip install -r requirements.txt
python src/emotion_classification.py
```
artifacts/about.md
ADDED
@@ -0,0 +1,75 @@
# Common NLP Tasks
***This app demonstrates common NLP techniques and use cases.***
>This app demonstrates typical NLP techniques used in real-world use cases, such as NER & POS recognition, text summarization, and text classification.

* **`NER`** has applications across industries. For example, in `finance` it can extract essential information from earnings reports, financial statements, news articles, and product mentions for automated analysis, fraud detection, and spotting investment opportunities. In `media and entertainment`, it analyzes text for content creation and personalization. In `e-commerce`, it extracts product information from reviews, customer feedback, and descriptions, enabling automated analysis and personalized recommendations.

* **`POS`** tagging is an essential NLP technique used in machine translation, word-sense disambiguation, question answering, parsing, and so on.

* **`Text Summarization`** has a plethora of real-world use cases, for example: `media monitoring` for sensitive and objectionable content, `helping disabled people` by presenting only short and relevant content, producing succinct summaries of `meetings and video conferences`, and summarizing financial documents such as earnings reports and financial news to quickly derive market signals.

* **`Text Classification`** has a multitude of applications, such as `categorizing customer support tickets` (billing, feedback, questions, complaints, etc.), `sentiment analysis` (customer feedback, tweets, etc.), and `content moderation` (hate speech, obscene language, NSFW content, etc.).


# App UI Details
The app has four tabs: "ABOUT", "NER & POS", "TEXT SUMMARIZATION", and "TEXT CLASSIFICATION".

## ABOUT Tab
This tab displays this overview page.

## NER & POS Tab
Given a text fragment, named entities (NER) and parts of speech (POS) in the text can be extracted with the click of a button:

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/ner.png?raw=true"/><br>

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/pos.png?raw=true"/>


Because of hardware resource constraints on public cloud hosting, the app uses a "small" language model, so the illustrated functionality is far from ideal. A bigger model running on more capable hardware will yield much better results.

## TEXT SUMMARIZATION Tab
A brief summary is generated from the given text. The summarization technique (TextRankSummarizer, LexRankSummarizer, or LsaSummarizer) and the length of the summary may be selected by the user.

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ1.png?raw=true"/>

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ2.png?raw=true"/>

## TEXT CLASSIFICATION Tab
The text classifier determines the `emotion` portrayed by a given sentence or paragraph. This app uses a `LogisticRegression` classifier to detect emotions, trained on 34,000 labeled samples.

<img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/emotion.png?raw=true"/>

Please keep in mind that this is far from flawless. Given the training and inference hardware restrictions, the corpus used for training is tiny and the model is basic. Training on a much bigger text corpus and employing a model capable of classifying non-linear data (e.g., XGBoost, RandomForest, or a neural network) would yield significantly better results.


# Project Source
[👉 Visit GitHub Repo](https://github.com/sssingh/nlp_ner_summarization_classification)

# Contact Me
[](mailto:[email protected])
[](https://twitter.com/@thesssingh)
[](https://www.linkedin.com/in/sssingh/)
[](https://sunilssingh.me)

# Appendix

## Local Installation and Run
To run the app locally:
1. Create a conda or virtual environment and activate it
2. Install Python 3.11.0 or above
3. Execute the commands below from a terminal/command prompt
```
git clone https://github.com/sssingh/nlp_ner_summarization_classification
cd nlp_ner_summarization_classification
pip install -r requirements.txt
streamlit run src/app.py
```
4. Open any browser and visit `localhost:8501`

NOTE: The trained text classifier is kept in the `artifacts` folder as the `lr_model.joblib` file. If you wish to re-train the model and change its hyperparameters (or use another classifier), then:
* Modify the `src/emotion_classification.py` script
* Execute the commands below from a terminal/command prompt
```
pip install -r requirements.txt
python src/emotion_classification.py
```
artifacts/lr_model.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:265aaad101032c44519b05554897b2963a795f737da0a2c2e8b4a1d3a2e58a20
size 2697989
assets/NLP.png
ADDED
Git LFS Details

assets/emotion.png
ADDED
Git LFS Details

assets/logo.png
ADDED
Git LFS Details

assets/ner.png
ADDED
Git LFS Details

assets/pos.png
ADDED
Git LFS Details

assets/summ1.png
ADDED
Git LFS Details

assets/summ2.png
ADDED
Git LFS Details

assets/title.png
ADDED
Git LFS Details

data/emotions.csv
ADDED
The diff for this file is too large to render. See raw diff.
notebooks/experiments.ipynb
ADDED
@@ -0,0 +1,481 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import spacy\n",
    "from sumy.parsers.plaintext import PlaintextParser\n",
    "from sumy.nlp.tokenizers import Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting en-core-web-sm==3.5.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n",
      "\u001b[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 3.9 MB/s eta 0:00:00\n",
      "\u001b[?25hRequirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from en-core-web-sm==3.5.0) (3.5.3)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.12)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.4)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.7)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.6)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.6)\n",
      "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.10)\n",
      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.9.1)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.4.6)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.7)\n",
      "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.4.1)\n",
      "Requirement already satisfied: pathy>=0.10.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.10.1)\n",
      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (5.2.1)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.65.0)\n",
      "Requirement already satisfied: numpy>=1.15.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.24.3)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.29.0)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.10.8)\n",
      "Requirement already satisfied: jinja2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.1.2)\n",
      "Requirement already satisfied: setuptools in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (67.8.0)\n",
      "Requirement already satisfied: packaging>=20.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (23.0)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.3.0)\n",
      "Requirement already satisfied: typing-extensions>=4.2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.7.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.4)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.26.16)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2023.7.22)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.9)\n",
      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.0.4)\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.1.1)\n",
      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_sm')\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CompletedProcess(args=['python', '-m', 'spacy', 'download', 'en_core_web_sm'], returncode=0)"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subprocess.run([\"python\", \"-m\", \"spacy\", \"download\", \"en_core_web_sm\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "This is an example text in Singapore by Sunil Singh on 6th August 2023"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp = spacy.load('en_core_web_sm')\n",
    "doc = nlp(\"This is an example text in Singapore by Sunil Singh on 6th August 2023\")\n",
    "doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<!DOCTYPE html>\\n<html lang=\"en\">\\n <head>\\n <title>displaCy</title>\\n </head>\\n\\n <body style=\"font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, \\'Segoe UI\\', Helvetica, Arial, sans-serif, \\'Apple Color Emoji\\', \\'Segoe UI Emoji\\', \\'Segoe UI Symbol\\'; padding: 4rem 2rem; direction: ltr\">\\n<figure style=\"margin-bottom: 6rem\">\\n<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">This is an example text in \\n<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\\n Singapore\\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\\n</mark>\\n by \\n<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\\n Sunil Singh\\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\\n</mark>\\n on \\n<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\\n 6th August 2023\\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\\n</mark>\\n</div>\\n</figure>\\n</body>\\n</html>'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ner_html = displacy.render(docs=doc, style=\"ent\", jupyter=False, page=True)\n",
    "ner_html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Entity Code</th>\n",
       "      <th>Entity Description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DATE</td>\n",
       "      <td>Absolute or relative dates or periods</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>GPE</td>\n",
       "      <td>Countries, cities, states</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Entity Code                     Entity Description\n",
       "0        DATE  Absolute or relative dates or periods\n",
       "1         GPE              Countries, cities, states"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "label, desc = [],[]\n",
    "for ent in doc.ents:\n",
    "    label.append(ent.label_)\n",
    "    desc.append(spacy.explain(ent.label_))\n",
    "label, desc = list(set(label)), list(set(desc))\n",
    "df = pd.DataFrame(data={\"Entity Code\":label, \"Entity Description\":desc})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Singapore, Sunil Singh, 6th August 2023)"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc.ents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This PRON pronoun\n",
      "is AUX auxiliary\n",
      "an DET determiner\n",
      "example NOUN noun\n",
      "text NOUN noun\n",
      "in ADP adposition\n",
      "Singapore PROPN proper noun\n",
      "by ADP adposition\n",
      "Sunil PROPN proper noun\n",
      "Singh PROPN proper noun\n",
      "on ADP adposition\n",
      "6th ADJ adjective\n",
      "August PROPN proper noun\n",
      "2023 NUM numeral\n"
     ]
    }
   ],
   "source": [
    "for token in doc:\n",
    "    print(token.text, token.pos_, spacy.explain(token.pos_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Token</th>\n",
       "      <th>Tag</th>\n",
       "      <th>Pos</th>\n",
       "      <th>Description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>example</td>\n",
       "      <td>NN</td>\n",
       "      <td>NOUN</td>\n",
       "      <td>noun, singular or mass</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>text</td>\n",
       "      <td>NN</td>\n",
       "      <td>NOUN</td>\n",
       "      <td>noun, singular or mass</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Singapore</td>\n",
       "      <td>NNP</td>\n",
       "      <td>PROPN</td>\n",
       "      <td>noun, proper singular</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Sunil</td>\n",
       "      <td>NNP</td>\n",
       "      <td>PROPN</td>\n",
       "      <td>noun, proper singular</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Singh</td>\n",
       "      <td>NNP</td>\n",
       "      <td>PROPN</td>\n",
       "      <td>noun, proper singular</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6th</td>\n",
       "      <td>JJ</td>\n",
       "      <td>ADJ</td>\n",
       "      <td>adjective (English), other noun-modifier (Chin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>August</td>\n",
       "      <td>NNP</td>\n",
       "      <td>PROPN</td>\n",
       "      <td>noun, proper singular</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2023</td>\n",
       "      <td>CD</td>\n",
       "      <td>NUM</td>\n",
       "      <td>cardinal number</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Token  Tag    Pos                                        Description\n",
       "0    example   NN   NOUN                             noun, singular or mass\n",
       "1       text   NN   NOUN                             noun, singular or mass\n",
       "2  Singapore  NNP  PROPN                              noun, proper singular\n",
       "3      Sunil  NNP  PROPN                              noun, proper singular\n",
       "4      Singh  NNP  PROPN                              noun, proper singular\n",
       "5        6th   JJ    ADJ  adjective (English), other noun-modifier (Chin...\n",
       "6     August  NNP  PROPN                              noun, proper singular\n",
       "7       2023   CD    NUM                                    cardinal number"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word, tag, pos, desc = [], [], [], []\n",
    "for token in doc:\n",
    "    if token.is_stop or token.is_punct:\n",
    "        continue\n",
    "    word.append(str(token))\n",
    "    tag.append(str(token.tag_))\n",
    "    pos.append(token.pos_)\n",
    "    desc.append(spacy.explain(token.tag_))\n",
    "pd.DataFrame(data=dict(Token=word, Tag=tag, Pos=pos, Description=desc))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from sumy.parsers.plaintext import PlaintextParser\n",
    "from sumy.nlp.tokenizers import Tokenizer\n",
    "from sumy.summarizers.text_rank import TextRankSummarizer\n",
    "from sumy.summarizers.lex_rank import LexRankSummarizer\n",
    "from sumy.summarizers.lsa import LsaSummarizer\n",
    "from dataclasses import dataclass\n",
    "@dataclass\n",
    "class __AppConfig:\n",
    "    \"\"\"app-wide configurations\"\"\"\n",
    "    summarizers = dict(\n",
    "        TextRankSummarizer=\"sumy.summarizers.text_rank\",\n",
    "        LexRankSummarizer=\"sumy.summarizers.lex_rank\",\n",
    "        LsaSummarizer=\"sumy.summarizers.lsa\",\n",
    "    )\n",
    "### make configs available to any module that imports this module\n",
    "app_config = __AppConfig()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "def class_from_name(module, class_name):\n",
    "    return getattr(module, class_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "method=\"TextRankSummarizer\"\n",
    "def get_summarizer(method):\n",
    "    module=sys.modules[app_config.summarizers.get(method)]\n",
    "    summarizer = class_from_name(module, method)\n",
    "    return summarizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n",
    "Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<sumy.parsers.plaintext.PlaintextParser at 0x7fa774f4a510>"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parser = PlaintextParser.from_string(text, Tokenizer(\"english\"))\n",
    "parser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parser.document.sentences"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
requirements.txt
ADDED
@@ -0,0 +1,9 @@
scikit-learn==1.2.*
joblib==1.2.*
pandas==1.5.*
streamlit==1.24.*
spacy==3.5.0
sumy==0.11.*
neattext==0.1.*
plotly==5.9.*
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz
src/app.py
ADDED
@@ -0,0 +1,39 @@
"""Application entry point, global configuration, application structure"""

from config import app_config
import data
import utils
import tab_about
import tab_ner
import tab_emotions
import tab_summarization
import streamlit as st


def init():
    ### setup app-wide configuration
    utils.setup_app(app_config)

    ### load data only once and cache it
    nlp = data.load_lang_model(app_config.spacy_lang_model)
    data.load_nltk_punkt()
    df = data.load_emotions_data(app_config.emotions_data_file)

    ### initialize session state

    ### setup app tab structure
    about, ner, summarization, sentiment = utils.create_tabs(
        ["ABOUT 👋", "NER & POS 🔍", "TEXT SUMMARIZATION 📝", "TEXT CLASSIFICATION 📑"]
    )
    with about:
        tab_about.render()
    with ner:
        tab_ner.render(nlp)
    with summarization:
        tab_summarization.render()
    with sentiment:
        tab_emotions.render(df)


if __name__ == "__main__":
    init()
src/config.py
ADDED
@@ -0,0 +1,94 @@
"""All app-specific user defined configurations are defined here"""

import os
from dataclasses import dataclass
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
import plotly.express as px


### define all plotting configuration here,
### should not be accessed and changed directly hence leading "__"
@dataclass
class __PlotConfig:
    """All plotting configurations are defined here"""

    # Available themes (templates):
    # ['ggplot2', 'seaborn', 'simple_white', 'plotly',
    #  'plotly_white', 'plotly_dark', 'presentation',
    #  'xgridoff', 'ygridoff', 'gridon', 'none']
    theme = "plotly_dark"
    cat_color_map = px.colors.qualitative.T10
    cat_color_map_r = px.colors.qualitative.T10_r
    cont_color_map = px.colors.sequential.amp
    cont_color_map_r = px.colors.sequential.amp_r


### define all app-wide configuration here,
### should not be accessed and changed directly hence leading "__"
@dataclass
class __AppConfig:
    """app-wide configurations"""

    # get current working directory
    cwd = os.getcwd()
    banner_image_file = f"{cwd}/"
    logo_image_file = f"{cwd}/assets/logo.png"
    app_icon_file = f"{cwd}/assets/NLP.png"
    app_title = "Applications"
    readme_file_path = f"{cwd}/artifacts/about.md"
    app_short_desc = "For common NLP use cases"
    emotions_data_file = f"{cwd}/data/emotions.csv"
    emoji_map = {
        "joy": "😃",
        "anger": "😡",
        "disgust": "🤮",
        "fear": "😨",
        "neutral": "😐",
        "sadness": "😔",
        "shame": "🫣",
        "surprise": "😲",
    }
    model_file = f"{cwd}/artifacts/lr_model.joblib"
    sidebar_state = "expanded"  # collapsed
    layout = "centered"  # wide
    icon_question = "❓"
    icon_important = "🎯"
    icon_info = "ℹ️"
    icon_stop = "⛔"
    icon_about = "👋"
    spacy_lang_model = "en_core_web_sm"
    # sumy summarizers
    summarizers = dict(
        TextRankSummarizer={
            "module": "sumy.summarizers.text_rank",
            "desc": (
                "**`TextRank`** is a graph based ranking algorithm. Read this article"
                + " https://blogs.cornell.edu/info2040/2018/10/22/40068/"
                + " to get a good intuition behind it"
            ),
        },
        LexRankSummarizer={
            "module": "sumy.summarizers.lex_rank",
            "desc": (
                "**`LexRank`** is another graph based ranking algorithm. Read this"
                + " https://github.com/crabcamp/lexrank"
                + " to get a good intuition behind it"
            ),
        },
        LsaSummarizer={
            "module": "sumy.summarizers.lsa",
            "desc": (
                "**`LSA`** or Latent Semantic Analysis uses word frequency and Singular"
                + " Value Decomposition (SVD). Read this"
                + " https://www.analyticsvidhya.com/blog/2021/09/latent-semantic-analysis-and-its-uses-in-natural-language-processing/ article"
                + " to get a good intuition behind it"
            ),
        },
    )


### make configs available to any module that imports this module
app_config = __AppConfig()
plot_config = __PlotConfig
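The `summarizers` mapping above stores each sumy class name alongside its module path so the class can be looked up dynamically at runtime. A small sketch of that lookup, adapted from the `get_summarizer` cell in `notebooks/experiments.ipynb` (the `["module"]` key access matches the dict shape used here, not the notebook's flat mapping):

```python
# Resolve a summarizer class by name from the module paths stored in
# app_config.summarizers; the modules are already imported in config.py,
# so they can be fetched from sys.modules.
import sys

def get_summarizer(method):
    module = sys.modules[app_config.summarizers[method]["module"]]
    return getattr(module, method)

summarizer_cls = get_summarizer("TextRankSummarizer")  # -> TextRankSummarizer class
```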
src/data.py
ADDED
@@ -0,0 +1,55 @@
"""All app-specific data and disk-IO related functionality implemented here"""

import subprocess
import joblib
import pandas as pd
import neattext.functions as nfx
import nltk
import spacy
import streamlit as st


@st.cache_resource
def load_lang_model(model):
    """Download and then instantiate the language model"""
    # subprocess.run(["python", "-m", "spacy", "download", model])
    nlp = spacy.load(model)
    return nlp


@st.cache_resource
def load_nltk_punkt():
    """Downloads NLTK tokenizers"""
    nltk.download("punkt")


@st.cache_resource
def load_emotions_data(data_file_path):
    """Reads a given data-file and returns a DataFrame"""
    return pd.read_csv(data_file_path)


def preprocess_data(df):
    """Cleans and transforms data"""
    df["Clean_Text"] = df["Text"].apply(nfx.remove_userhandles)
    df["Clean_Text"] = df["Clean_Text"].apply(nfx.remove_stopwords)
    df["Clean_Text"] = df["Clean_Text"].apply(nfx.remove_urls)
    df["Clean_Text"] = df["Clean_Text"].apply(nfx.remove_punctuations)
    return df


def preprocess_pred_data(input_data):
    input_data = nfx.remove_userhandles(input_data)
    input_data = nfx.remove_stopwords(input_data)
    input_data = nfx.remove_urls(input_data)
    input_data = nfx.remove_punctuations(input_data)
    return [input_data]


def save_model(model_obj, model_file_path):
    joblib.dump(value=model_obj, filename=model_file_path)


@st.cache_resource
def load_model(model_file_path):
    return joblib.load(model_file_path)
src/emotion_classification.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import data
import config
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def __train_model(df, full=False):
    ### train on full data for the final model, else split the data and then train
    if full:
        X_train = df["Clean_Text"]
        y_train = df["Emotion"]
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            df["Clean_Text"], df["Emotion"], test_size=0.2, random_state=42
        )
    ### build model pipeline
    lr_pipeline = Pipeline(
        steps=[("cv", CountVectorizer()), ("lr", LogisticRegression(max_iter=300))]
    )
    ### train and test the model
    print(f"\nTraining LogisticRegression with {X_train.shape[0]} samples...")
    lr_pipeline.fit(X_train, y_train)
    if not full:
        print(f"Testing LogisticRegression with {X_test.shape[0]} samples...")
        score = lr_pipeline.score(X_test, y_test)
        print(f"Accuracy achieved: [{score*100:.2f}%].")
    return lr_pipeline


if __name__ == "__main__":
    emotions_df = data.load_emotions_data(config.app_config.emotions_data_file)
    emotions_df = data.preprocess_data(emotions_df)
    model = __train_model(emotions_df, full=True)
    data.save_model(model, config.app_config.model_file)
    print(f"Saved model to: [{config.app_config.model_file}]")

    ### Test code
    # model = data.load_model(config.app_config.model_file)
    # test_text = "I am loving NLP and it makes me feel so good"
    # print(f"\nTesting model with sample text '{test_text}'\nPrediction:")
    # print(model.predict([test_text]))
    # print(model.classes_)
    # print(model.predict_proba([test_text]))
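Because the pipeline couples a bag-of-words CountVectorizer with LogisticRegression, the saved model accepts raw strings directly. A minimal stand-alone sketch of that behavior (illustrative, not part of the commit; texts and labels are invented, the real classes come from data/emotions.csv):

# Illustrative inference sketch (texts/labels invented; not part of the commit):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([("cv", CountVectorizer()), ("lr", LogisticRegression(max_iter=300))])
pipe.fit(
    ["i am so happy today", "this is awful and sad", "what a big surprise"],
    ["joy", "sadness", "surprise"],
)
print(pipe.predict(["feeling happy and glad"]))        # likely ['joy']
print(pipe.predict_proba(["feeling happy and glad"]))  # one probability per class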
src/plot.py
ADDED
@@ -0,0 +1,37 @@
from config import plot_config
import pandas as pd
import plotly.express as px
import plotly.io as io

# setup app-wide plotly theme
io.templates.default = plot_config.theme


def plot_proba(classes, proba):
    """Bar chart of per-class prediction probabilities"""
    df_proba = pd.DataFrame({"Emotions": classes, "Probability": proba})
    df_proba["Emotions"] = df_proba["Emotions"].str.upper()
    df_proba = df_proba.sort_values(by="Probability", ascending=False)
    fig = px.bar(
        data_frame=df_proba,
        x="Probability",
        y="Emotions",
        color="Emotions",
        title="Prediction Probabilities",
        color_discrete_sequence=plot_config.cat_color_map,
    )
    return fig


def plot_class_dist(df):
    """Bar chart of the training-data class distribution"""
    df_count = pd.DataFrame(df["Emotion"].value_counts()).reset_index()
    df_count.columns = ["Emotions", "Count"]
    df_count["Emotions"] = df_count["Emotions"].str.upper()
    fig = px.bar(
        data_frame=df_count,
        x="Emotions",
        y="Count",
        color="Emotions",
        title="Class Distribution",
        color_discrete_sequence=plot_config.cat_color_map,
    )
    return fig
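One subtlety in plot_class_dist is the value_counts-to-DataFrame reshape; assigning df_count.columns right after reset_index keeps the column names stable across pandas versions. A quick illustration (not part of the commit):

# Illustrative reshape (not part of the commit):
import pandas as pd

s = pd.Series(["joy", "joy", "anger"], name="Emotion")
df_count = pd.DataFrame(s.value_counts()).reset_index()
df_count.columns = ["Emotions", "Count"]  # normalize names regardless of pandas version
print(df_count)
#   Emotions  Count
# 0      joy      2
# 1    anger      1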
src/tab_about.py
ADDED
@@ -0,0 +1,22 @@
"""About tab rendering functionality"""

import streamlit as st
from config import app_config


###
### INTERNAL FUNCTIONS
###
def __section(header):
    """Render the section on this page"""
    st.header(header)
    with open(app_config.readme_file_path, "r") as f:
        about = f.read()
    st.markdown(about, unsafe_allow_html=True)


###
### MAIN FLOW, entry point
###
def render():
    __section("About The App")
src/tab_emotions.py
ADDED
@@ -0,0 +1,57 @@
"""Emotions tab rendering functionality"""

from config import app_config
import plot
import streamlit as st
import data
import utils


###
### INTERNAL FUNCTIONS
###
def __section(header, df):
    """Build page UI elements"""
    st.header(header)
    ### accept text input, make prediction and show results
    st.write(
        "`Enter the text` to be classified in the text area and then click `Predict`"
    )
    text = st.text_area("Enter Text:", height=200)
    if st.button("Predict"):
        model = data.load_model(app_config.model_file)
        pred, pred_proba = utils.make_prediction(model, text, proba=True)
        pred_col, conf_col = st.columns(2)
        with pred_col:
            emotion = pred[0]
            st.success(
                f"Detected Emotion: {emotion.upper()} {app_config.emoji_map[emotion]}"
            )

        with conf_col:
            st.success(f"Confidence: {pred_proba.max()*100:.2f}%")
        fig = plot.plot_proba(model.classes_, pred_proba)
        st.plotly_chart(fig, use_container_width=True)

    ### Supplementary details about the model used
    st.divider()
    with st.expander("Supplementary under-the-hood details:"):
        st.info(
            body="""
            A trained LogisticRegression model is used here for emotion detection.
            The model has been trained on labeled data of 34,000 samples. Sample
            data and the class distribution are shown below.
            """,
            icon=app_config.icon_info,
        )
        st.dataframe(df.loc[:15, ["Clean_Text", "Emotion"]])
        fig = plot.plot_class_dist(df)
        st.plotly_chart(fig, use_container_width=True)


###
### MAIN FLOW, entry point
###
def render(df):
    """Emotions tab page"""
    __section("Emotions Detection", df)
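As context for the confidence figure: predict_proba returns probabilities in [0, 1], one row per input, hence the scaling by 100 before formatting as a percentage. A tiny illustration (not part of the commit):

# Illustrative scaling of a predict_proba row (not part of the commit):
import numpy as np

pred_proba = np.array([[0.05, 0.80, 0.15]]).squeeze()  # single-input row -> 1-D
print(f"Confidence: {pred_proba.max()*100:.2f}%")      # -> Confidence: 80.00%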
src/tab_ner.py
ADDED
@@ -0,0 +1,64 @@
"""NER tab rendering functionality"""
import pandas as pd
import streamlit as st
import spacy
from spacy import displacy


###
### INTERNAL FUNCTIONS
###
def __analyse(text, analysis_type, nlp):
    """Analyse the given text for named entities (NER) or parts of speech (POS)"""
    doc = nlp(text)
    ### analyse based on type
    if analysis_type == "NER":
        heading = "Named Entity Recognition (NER)"
        result = displacy.render(docs=doc, style="ent", jupyter=False)
        label, desc = [], []
        for ent in doc.ents:
            label.append(ent.label_)
            desc.append(spacy.explain(ent.label_))
        df = pd.DataFrame(data={"Codes": label, "Description": desc})
        df = df.drop_duplicates().reset_index()
    elif analysis_type == "POS":
        result = ""
        word, tag, pos, desc = [], [], [], []
        for token in doc:
            if token.is_stop or token.is_punct:
                continue
            word.append(str(token))
            tag.append(str(token.tag_))
            pos.append(token.pos_)
            desc.append(spacy.explain(token.tag_))
        df = pd.DataFrame(data=dict(Token=word, Tag=tag, Pos=pos, Description=desc))
        heading = "Parts of speech tagging (POS)"
    return result, df, heading


def __section(header, nlp):
    """Build page UI elements"""
    st.header(header)
    st.write(
        "Choose the analysis-type (NER/POS) to be performed, "
        + "enter the text in the text area and then click Analyse"
    )
    analysis_type = st.radio(label="Type:", options=["NER", "POS"])
    text = st.text_area("Enter text:", height=300)
    ### analyse the entered text and show the results
    if st.button("Analyse"):
        result, df, heading = __analyse(text, analysis_type, nlp)
        st.subheader(heading)
        st.divider()
        st.write(result, unsafe_allow_html=True)
        st.write(" ")
        st.dataframe(df, use_container_width=True)
        st.divider()


###
### MAIN FLOW, entry point
###
def render(nlp):
    """NER tab page"""
    __section("Named Entity & Parts Of Speech Recognition", nlp)
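A stand-alone sketch of the NER path above, runnable outside Streamlit (illustrative, not part of the commit; assumes the en_core_web_sm model is installed and the sample sentence is invented):

# Illustrative NER run (not part of the commit; assumes en_core_web_sm installed):
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is opening a new office in London next year.")
html = displacy.render(docs=doc, style="ent", jupyter=False)  # markup the tab renders
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))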
src/tab_summarization.py
ADDED
@@ -0,0 +1,83 @@
"""Summarization tab rendering functionality"""

from config import app_config
import utils
import sys
import streamlit as st
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.evaluation import rouge_n


###
### INTERNAL FUNCTIONS
###
def __get_summarizer(summarizer_type):
    """Helper to get summarizer object given its name as string"""
    summarizer_dict = app_config.summarizers.get(summarizer_type)
    module = sys.modules[summarizer_dict["module"]]
    summarizer = utils.get_class_from_name(module, summarizer_type)
    desc = summarizer_dict["desc"]
    return summarizer(), desc


def __summarize(text, summarizer, n_sentences):
    ### instantiate the text parser, summarize text and return the summary text
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summary_tuple = summarizer(parser.document, n_sentences)
    summary_text = ""
    for sentence in summary_tuple:
        summary_text += str(sentence) + " "
    ### compute sentence counts and the ROUGE score for the summary text
    rouge = rouge_n(
        evaluated_sentences=summary_tuple,
        reference_sentences=parser.document.sentences,
        n=2,
    )
    stats = f"""
        Number of sentences in original text: **{len(parser.document.sentences)}**
        Number of sentences in summary text: **{len(summary_tuple)}**
        ROUGE (bi-gram) score: **{rouge}**
    """
    return summary_text, stats


def __section(header):
    """Build page UI elements"""
    st.header(header)
    st.write(
        "Choose the `Summarization Method`, `Enter Text` in the text "
        + "area, choose the `Number Of Sentences` required in summary text "
        + "and then click `Summarize`"
    )
    summarizer_type = st.radio(
        "Summarization Method:",
        options=[
            # "WordFrequency",
            "TextRankSummarizer",
            "LexRankSummarizer",
            "LsaSummarizer",
        ],
    )
    ### Based on type selected, fetch the summarizer object and show short description
    summarizer, desc = __get_summarizer(summarizer_type)
    st.info(body=f"{desc}", icon=app_config.icon_info)
    text = st.text_area("Enter text:", height=300, key="summarization")
    n_sentences = st.slider(
        label="Number Of Sentences", min_value=1, max_value=10, value=3
    )
    ### summarize the entered text and show the results
    if st.button("Summarize"):
        summary, stats = __summarize(text, summarizer, n_sentences)
        st.divider()
        st.subheader("Summary")
        st.success(stats)
        st.write(summary)
        st.divider()


###
### MAIN FLOW, entry point
###
def render():
    """Summarization tab page"""
    __section("Text Summarization")
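The summarizers come from sumy and share one call signature: summarizer(document, sentence_count). A minimal stand-alone sketch (illustrative, not part of the commit; assumes NLTK punkt data is available, which load_nltk_punkt in src/data.py downloads, and the sample text is invented):

# Illustrative sumy run (not part of the commit; requires NLTK punkt data):
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

text = (
    "Streamlit makes it easy to build data apps. "
    "Sumy offers several extractive summarizers. "
    "TextRank ranks sentences by graph centrality. "
    "The top-ranked sentences form the summary."
)
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summary = TextRankSummarizer()(parser.document, 2)  # keep the 2 best sentences
print(" ".join(str(s) for s in summary))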
src/utils.py
ADDED
@@ -0,0 +1,61 @@
"""App agnostic reusable utility functionality"""

from config import app_config
import data
from typing import List
from PIL import Image
import streamlit as st


def setup_app(config):
    """Sets up the application icon, banner and title"""
    st.set_page_config(
        page_title=config.app_title,
        page_icon=app_config.app_icon_file,
        initial_sidebar_state=config.sidebar_state,
        layout=config.layout,
    )
    ### Logo and App title, description
    with st.container():
        app_icon, app_title, logo = st.columns([0.2, 0.9, 0.3])
        app_icon.image(image=app_config.app_icon_file, width=80)
        app_title.markdown(
            f"<h1 style='text-align: left; color: #03989e;'>{app_config.app_title}</h1> ",
            unsafe_allow_html=True,
        )
        app_title.markdown(
            f"<p style='text-align: left;'>{app_config.app_short_desc}</p>",
            unsafe_allow_html=True,
        )
        logo.image(image=app_config.logo_image_file, width=100)


def create_tabs(tabs: List[str]):
    """Creates streamlit tabs"""
    return st.tabs(tabs)


def download_file(btn_label, data, file_name, mime_type):
    """Creates a download button for data download"""
    st.download_button(label=btn_label, data=data, file_name=file_name, mime=mime_type)


def get_class_from_name(module, class_name: str):
    """Returns the class object given its name as a string and its (imported) module"""
    return getattr(module, class_name)


def make_prediction(model, input_data, proba=False):
    """
    Prediction pipeline for the model; the model must have a predict method, and a
    predict_proba method if prediction probabilities are to be returned
    """
    ### preprocess the input and return it in a shape suitable for this model
    processed_input_data = data.preprocess_pred_data(input_data)
    ### call model's predict method
    pred = model.predict(processed_input_data)
    ### call model's predict_proba method if required, squeezing the single-row
    ### result to 1-D; otherwise return an empty list for the probabilities
    pred_proba = []
    if proba:
        pred_proba = model.predict_proba(processed_input_data).squeeze()
    return pred, pred_proba
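get_class_from_name is what lets tab_summarization turn the radio-button string into a sumy class: the module object is pulled from sys.modules and the class looked up with getattr. A small illustration (not part of the commit):

# Illustrative dynamic class lookup (not part of the commit):
import sys
import sumy.summarizers.lex_rank  # importing registers the module in sys.modules

module = sys.modules["sumy.summarizers.lex_rank"]
cls = getattr(module, "LexRankSummarizer")  # same lookup as get_class_from_name
print(cls().__class__.__name__)  # -> LexRankSummarizer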