Sunil Surendra Singh committed on
Commit 769af1a · 0 Parent(s):

First commit

.gitattributes ADDED
@@ -0,0 +1,5 @@
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
.github/workflows/main.yml ADDED
@@ -0,0 +1,19 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push --force https://sssingh:$HF_TOKEN@huggingface.co/spaces/sssingh/nlp-ner-summarization-classification main
.gitignore ADDED
@@ -0,0 +1,139 @@
+ # MY CHANGES
+ scratch*.ipynb
+ venv*/
+ .vscode*
+ .vscode*/
+ .examples/
+
+ .swp
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv*/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
.streamlit/config.toml ADDED
@@ -0,0 +1,10 @@
+ [theme]
+ primaryColor="#bc50ee"
+ backgroundColor="#333333"
+ secondaryBackgroundColor="#2a2a2b"
+ textColor="#fbfafa"
+ font = "sans serif"
+
+ [server]
+ runOnSave = false
+ headless = true
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Sunil S. Singh
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,92 @@
+ ---
+ title: Nlp - NER POS Text-Summarization Text-Classification
+ emoji: 🧾
+ colorFrom: red
+ colorTo: pink
+ sdk: streamlit
+ sdk_version: 1.27.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ <a href="https://huggingface.co/spaces/sssingh/nlp-ner-summarization-classification" target="_blank"><img src="https://img.shields.io/badge/click_here_to_open_streamlit_app-f63366?style=for-the-badge&logo=streamlit&logoColor=black" /></a>
+
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/title.png?raw=true" width="1000" height="350"/><br><br>
+
+ # Common NLP Tasks
+ ***This app demonstrates common NLP techniques and use cases.***
+ >This app demonstrates typical NLP techniques used in real-world use cases: NER & POS recognition, text summarization, and text classification.
+
+ * **`NER`** has applications across industries. In `finance`, it can extract essential information from earnings reports, financial statements, news articles, and product mentions for automated analysis, fraud detection, and spotting investment opportunities. In `media and entertainment`, it analyzes text for content creation and personalization. In `e-commerce`, it extracts product information from reviews, customer feedback, and descriptions, enabling automated analysis and personalized recommendations.
+
+ * **`POS`** tagging is an essential NLP technique used in machine translation, word-sense disambiguation, question answering, parsing, and so on.
+
+ * **`Text Summarization`** has a plethora of real-world use cases: `media monitoring` for sensitive and objectionable content, `assisting disabled people` by presenting only short and relevant content, producing succinct summaries of `meetings and video-conferences`, and summarizing financial documents such as earnings reports and financial news to quickly derive market signals.
+
+ * **`Text Classification`** has a multitude of applications, such as `categorizing customer support tickets` (billing, feedback, questions, complaints, etc.), `sentiment analysis` (customer feedback, tweets, etc.), and `content moderation` (hate speech, obscene language, NSFW, etc.).
+
+
+ # App UI Details
+ The app has four tabs: "ABOUT", "NER & POS", "TEXT SUMMARIZATION", and "TEXT CLASSIFICATION".
+
+ ## ABOUT Tab
+ This tab displays this documentation (the contents of `artifacts/about.md`).
+
+ ## NER & POS Tab
+ Given a text fragment, named entities (NER) and parts of speech (POS) in the text can be extracted with the click of a button:
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/ner.png?raw=true"/><br>
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/pos.png?raw=true"/>
+
+
+ Because of hardware resource constraints on public cloud hosting, the app uses a "small" language model, so the illustrated results are far from ideal. A bigger model running on more capable hardware will yield much better results.
+
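For readers who want to reproduce the NER & POS behaviour outside the app, here is a minimal sketch of what `src/tab_ner.py` does, assuming the `en_core_web_sm` model pinned in `requirements.txt` is installed:

```python
# Minimal sketch of the app's NER & POS analysis (see src/tab_ner.py).
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is an example text in Singapore by Sunil Singh on 6th August 2023")

# Named entities, with human-readable explanations of the entity codes
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))

# POS tags, skipping stop words and punctuation as the app does
for token in doc:
    if token.is_stop or token.is_punct:
        continue
    print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_))
```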
+ ## TEXT SUMMARIZATION Tab
+ A brief summary is generated from the given text. The summarization technique (TextRankSummarizer, LexRankSummarizer, LsaSummarizer) and the length of the summary may be selected by the user.
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ1.png?raw=true"/>
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ2.png?raw=true"/>
+
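As a rough sketch of the summarization flow (mirroring `__summarize` in `src/tab_summarization.py`; the other two summarizers are drop-in replacements):

```python
# Minimal summarization sketch using sumy; assumes the NLTK "punkt"
# tokenizer data has been downloaded (the app does this at startup).
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

long_text = "..."  # any multi-sentence passage to be summarized

parser = PlaintextParser.from_string(long_text, Tokenizer("english"))
summarizer = TextRankSummarizer()
# keep the top 3 sentences
for sentence in summarizer(parser.document, 3):
    print(sentence)
```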
+ ## TEXT CLASSIFICATION Tab
+ Given a sentence or paragraph, the text classifier determines the `emotion` it portrays. A `LogisticRegression` classifier is used to detect emotions in this app; it was trained on 34,000 labeled samples.
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/emotion.png?raw=true"/>
+
+ Please keep in mind that the results are far from flawless. Given the training and inference hardware restrictions, the corpus used for training is tiny and the model is basic. Training on a much bigger text corpus and employing a model capable of classifying non-linear data (e.g., XGBoost, RandomForest, or a neural network) would give significantly better results.
+
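The trained pipeline (CountVectorizer + LogisticRegression, see `src/emotion_classification.py`) can also be used directly; a minimal sketch, assuming it is run from the repository root so the relative path resolves:

```python
# Load the bundled classifier and predict an emotion for a raw sentence.
import joblib

model = joblib.load("artifacts/lr_model.joblib")
sample = ["I am loving NLP and it makes me feel so good"]
print(model.predict(sample))        # predicted emotion label
print(model.classes_)               # all emotion labels the model knows
print(model.predict_proba(sample))  # per-class probabilities
```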
+
+ # Project Source
+ [👉 Visit GitHub Repo](https://github.com/sssingh/nlp_ner_summarization_classification)
+
+ # Contact Me
+ [![email](https://img.shields.io/badge/Gmail-D14836?style=for-the-badge&logo=gmail&logoColor=white)](mailto:[email protected])
+ [![twitter](https://img.shields.io/badge/twitter-1DA1F2?style=for-the-badge&logo=twitter&logoColor=white)](https://twitter.com/@thesssingh)
+ [![linkedin](https://img.shields.io/badge/linkedin-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/sssingh/)
+ [![website](https://img.shields.io/badge/web_site-8B5BE8?style=for-the-badge&logo=ko-fi&logoColor=white)](https://sunilssingh.me)
+
+ # Appendix
+
+ ## Local Installation and Run
+ To run the app locally:
+ 1. Create a conda or virtual environment and activate it
+ 2. Install Python 3.11.0 or above
+ 3. Execute the commands below from a terminal/command prompt
+ ```
+ git clone https://github.com/sssingh/nlp_ner_summarization_classification
+ cd nlp_ner_summarization_classification
+ pip install -r requirements.txt
+ streamlit run src/app.py
+ ```
+ 4. Open any browser and visit `localhost:8501`
+
+ NOTE: The trained text classifier is kept in the `artifacts` folder as the `lr_model.joblib` file. If you wish to re-train the model with different hyperparameters (or use another classifier), then:
+ * Modify the `src/emotion_classification.py` script
+ * Execute the commands below from a terminal/command prompt
+ ```
+ pip install -r requirements.txt
+ python src/emotion_classification.py
+ ```
artifacts/about.md ADDED
@@ -0,0 +1,75 @@
+ # Common NLP Tasks
+ ***This app demonstrates common NLP techniques and use cases.***
+ >This app demonstrates typical NLP techniques used in real-world use cases: NER & POS recognition, text summarization, and text classification.
+
+ * **`NER`** has applications across industries. In `finance`, it can extract essential information from earnings reports, financial statements, news articles, and product mentions for automated analysis, fraud detection, and spotting investment opportunities. In `media and entertainment`, it analyzes text for content creation and personalization. In `e-commerce`, it extracts product information from reviews, customer feedback, and descriptions, enabling automated analysis and personalized recommendations.
+
+ * **`POS`** tagging is an essential NLP technique used in machine translation, word-sense disambiguation, question answering, parsing, and so on.
+
+ * **`Text Summarization`** has a plethora of real-world use cases: `media monitoring` for sensitive and objectionable content, `assisting disabled people` by presenting only short and relevant content, producing succinct summaries of `meetings and video-conferences`, and summarizing financial documents such as earnings reports and financial news to quickly derive market signals.
+
+ * **`Text Classification`** has a multitude of applications, such as `categorizing customer support tickets` (billing, feedback, questions, complaints, etc.), `sentiment analysis` (customer feedback, tweets, etc.), and `content moderation` (hate speech, obscene language, NSFW, etc.).
+
+
+ # App UI Details
+ The app has four tabs: "ABOUT", "NER & POS", "TEXT SUMMARIZATION", and "TEXT CLASSIFICATION".
+
+ ## ABOUT Tab
+ This tab displays this documentation (the contents of `artifacts/about.md`).
+
+ ## NER & POS Tab
+ Given a text fragment, named entities (NER) and parts of speech (POS) in the text can be extracted with the click of a button:
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/ner.png?raw=true"/><br>
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/pos.png?raw=true"/>
+
+
+ Because of hardware resource constraints on public cloud hosting, the app uses a "small" language model, so the illustrated results are far from ideal. A bigger model running on more capable hardware will yield much better results.
+
+ ## TEXT SUMMARIZATION Tab
+ A brief summary is generated from the given text. The summarization technique (TextRankSummarizer, LexRankSummarizer, LsaSummarizer) and the length of the summary may be selected by the user.
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ1.png?raw=true"/>
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/summ2.png?raw=true"/>
+
+ ## TEXT CLASSIFICATION Tab
+ Given a sentence or paragraph, the text classifier determines the `emotion` it portrays. A `LogisticRegression` classifier is used to detect emotions in this app; it was trained on 34,000 labeled samples.
+
+ <img src="https://github.com/sssingh/nlp_ner_summarization_classification/blob/main/assets/emotion.png?raw=true"/>
+
+ Please keep in mind that the results are far from flawless. Given the training and inference hardware restrictions, the corpus used for training is tiny and the model is basic. Training on a much bigger text corpus and employing a model capable of classifying non-linear data (e.g., XGBoost, RandomForest, or a neural network) would give significantly better results.
+
+
+ # Project Source
+ [👉 Visit GitHub Repo](https://github.com/sssingh/nlp_ner_summarization_classification)
+
+ # Contact Me
+ [![email](https://img.shields.io/badge/Gmail-D14836?style=for-the-badge&logo=gmail&logoColor=white)](mailto:[email protected])
+ [![twitter](https://img.shields.io/badge/twitter-1DA1F2?style=for-the-badge&logo=twitter&logoColor=white)](https://twitter.com/@thesssingh)
+ [![linkedin](https://img.shields.io/badge/linkedin-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/sssingh/)
+ [![website](https://img.shields.io/badge/web_site-8B5BE8?style=for-the-badge&logo=ko-fi&logoColor=white)](https://sunilssingh.me)
+
+ # Appendix
+
+ ## Local Installation and Run
+ To run the app locally:
+ 1. Create a conda or virtual environment and activate it
+ 2. Install Python 3.11.0 or above
+ 3. Execute the commands below from a terminal/command prompt
+ ```
+ git clone https://github.com/sssingh/nlp_ner_summarization_classification
+ cd nlp_ner_summarization_classification
+ pip install -r requirements.txt
+ streamlit run src/app.py
+ ```
+ 4. Open any browser and visit `localhost:8501`
+
+ NOTE: The trained text classifier is kept in the `artifacts` folder as the `lr_model.joblib` file. If you wish to re-train the model with different hyperparameters (or use another classifier), then:
+ * Modify the `src/emotion_classification.py` script
+ * Execute the commands below from a terminal/command prompt
+ ```
+ pip install -r requirements.txt
+ python src/emotion_classification.py
+ ```
artifacts/lr_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:265aaad101032c44519b05554897b2963a795f737da0a2c2e8b4a1d3a2e58a20
+ size 2697989
assets/NLP.png ADDED

Git LFS Details

  • SHA256: a1f0fddba6282fb359b3e895d844c706964ec777cbe5f731214209c488d25900
  • Pointer size: 130 Bytes
  • Size of remote file: 20.7 kB
assets/emotion.png ADDED

Git LFS Details

  • SHA256: 1f9e57b25fc05873184f3f52fc333b08ca63fcd2265a66773aa0095bfb2fe825
  • Pointer size: 130 Bytes
  • Size of remote file: 63.2 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: c0cffd7febc63ab6c296b41e28529d3f1338af1dfb13c1321de6e2dd5b4bfddc
  • Pointer size: 129 Bytes
  • Size of remote file: 1.88 kB
assets/ner.png ADDED

Git LFS Details

  • SHA256: 9b5ae58c05cebbd2524bfbcc27e9358c96dd8ab83f363866b2851c8a19898954
  • Pointer size: 130 Bytes
  • Size of remote file: 72.3 kB
assets/pos.png ADDED

Git LFS Details

  • SHA256: b66d4c7f97f775ca3ae9edd3fcbf1de491766e33b11b597df2656a1c6fa733a3
  • Pointer size: 130 Bytes
  • Size of remote file: 46.4 kB
assets/summ1.png ADDED

Git LFS Details

  • SHA256: b4e6a78ffa4b8e138d4ab16bc85caba616f8ec2e38bf83c3969a23e25b89aa3f
  • Pointer size: 131 Bytes
  • Size of remote file: 140 kB
assets/summ2.png ADDED

Git LFS Details

  • SHA256: 5300976cfaa26d92660f8c5593d458f98d6e3cc9539d160dc899839fb52e295e
  • Pointer size: 130 Bytes
  • Size of remote file: 86.9 kB
assets/title.png ADDED

Git LFS Details

  • SHA256: 11bf67d484d48636fdc871f8a7d03cbbaea9378f99ae4df85e817a83210b0726
  • Pointer size: 130 Bytes
  • Size of remote file: 22.7 kB
data/emotions.csv ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/experiments.ipynb ADDED
@@ -0,0 +1,481 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 106,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import subprocess\n",
+     "import spacy\nfrom spacy import displacy\n",
+     "from sumy.parsers.plaintext import PlaintextParser\n",
+     "from sumy.nlp.tokenizers import Tokenizer"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 94,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Collecting en-core-web-sm==3.5.0\n",
+       " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n",
+       "\u001b[2K ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 3.9 MB/s eta 0:00:00\n",
+       "\u001b[?25hRequirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from en-core-web-sm==3.5.0) (3.5.3)\n",
+       "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.12)\n",
+       "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.4)\n",
+       "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.7)\n",
+       "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.6)\n",
+       "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.6)\n",
+       "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.10)\n",
+       "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.9.1)\n",
+       "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.4.6)\n",
+       "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.7)\n",
+       "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.4.1)\n",
+       "Requirement already satisfied: pathy>=0.10.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.10.1)\n",
+       "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (5.2.1)\n",
+       "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.65.0)\n",
+       "Requirement already satisfied: numpy>=1.15.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.24.3)\n",
+       "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.29.0)\n",
+       "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.10.8)\n",
+       "Requirement already satisfied: jinja2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.1.2)\n",
+       "Requirement already satisfied: setuptools in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (67.8.0)\n",
+       "Requirement already satisfied: packaging>=20.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (23.0)\n",
+       "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.3.0)\n",
+       "Requirement already satisfied: typing-extensions>=4.2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.7.1)\n",
+       "Requirement already satisfied: charset-normalizer<4,>=2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.4)\n",
+       "Requirement already satisfied: idna<4,>=2.5 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.4)\n",
+       "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.26.16)\n",
+       "Requirement already satisfied: certifi>=2017.4.17 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2023.7.22)\n",
+       "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.9)\n",
+       "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.0.4)\n",
+       "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.0.4)\n",
+       "Requirement already satisfied: MarkupSafe>=2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.1.1)\n",
+       "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+       "You can now load the package via spacy.load('en_core_web_sm')\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+        "CompletedProcess(args=['python', '-m', 'spacy', 'download', 'en_core_web_sm'], returncode=0)"
+       ]
+      },
+      "execution_count": 94,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "subprocess.run([\"python\", \"-m\", \"spacy\", \"download\", \"en_core_web_sm\"])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 95,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "This is an example text in Singapore by Sunil Singh on 6th August 2023"
+       ]
+      },
+      "execution_count": 95,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "nlp = spacy.load('en_core_web_sm')\n",
+     "doc = nlp(\"This is an example text in Singapore by Sunil Singh on 6th August 2023\")\n",
+     "doc"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 96,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "'<!DOCTYPE html>\\n<html lang=\"en\">\\n <head>\\n <title>displaCy</title>\\n </head>\\n\\n <body style=\"font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, \\'Segoe UI\\', Helvetica, Arial, sans-serif, \\'Apple Color Emoji\\', \\'Segoe UI Emoji\\', \\'Segoe UI Symbol\\'; padding: 4rem 2rem; direction: ltr\">\\n<figure style=\"margin-bottom: 6rem\">\\n<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">This is an example text in \\n<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\\n Singapore\\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\\n</mark>\\n by \\n<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\\n Sunil Singh\\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\\n</mark>\\n on \\n<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\\n 6th August 2023\\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\\n</mark>\\n</div>\\n</figure>\\n</body>\\n</html>'"
+       ]
+      },
+      "execution_count": 96,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "ner_html = displacy.render(docs=doc, style=\"ent\", jupyter=False, page=True)\n",
+     "ner_html"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 97,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Entity Code</th>\n",
+        "      <th>Entity Description</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>DATE</td>\n",
+        "      <td>Absolute or relative dates or periods</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>GPE</td>\n",
+        "      <td>Countries, cities, states</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        "  Entity Code                     Entity Description\n",
+        "0        DATE  Absolute or relative dates or periods\n",
+        "1         GPE              Countries, cities, states"
+       ]
+      },
+      "execution_count": 97,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "import pandas as pd\n",
+     "label, desc = [],[]\n",
+     "for ent in doc.ents:\n",
+     "    label.append(ent.label_)\n",
+     "    desc.append(spacy.explain(ent.label_))\n",
+     "label, desc = list(set(label)), list(set(desc))\n",
+     "df = pd.DataFrame(data={\"Entity Code\":label, \"Entity Description\":desc})\n",
+     "df"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 98,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "(Singapore, Sunil Singh, 6th August 2023)"
+       ]
+      },
+      "execution_count": 98,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "doc.ents"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 99,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "This PRON pronoun\n",
+       "is AUX auxiliary\n",
+       "an DET determiner\n",
+       "example NOUN noun\n",
+       "text NOUN noun\n",
+       "in ADP adposition\n",
+       "Singapore PROPN proper noun\n",
+       "by ADP adposition\n",
+       "Sunil PROPN proper noun\n",
+       "Singh PROPN proper noun\n",
+       "on ADP adposition\n",
+       "6th ADJ adjective\n",
+       "August PROPN proper noun\n",
+       "2023 NUM numeral\n"
+      ]
+     }
+    ],
+    "source": [
+     "for token in doc:\n",
+     "    print(token.text, token.pos_, spacy.explain(token.pos_))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 100,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Token</th>\n",
+        "      <th>Tag</th>\n",
+        "      <th>Pos</th>\n",
+        "      <th>Description</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>example</td>\n",
+        "      <td>NN</td>\n",
+        "      <td>NOUN</td>\n",
+        "      <td>noun, singular or mass</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>text</td>\n",
+        "      <td>NN</td>\n",
+        "      <td>NOUN</td>\n",
+        "      <td>noun, singular or mass</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>Singapore</td>\n",
+        "      <td>NNP</td>\n",
+        "      <td>PROPN</td>\n",
+        "      <td>noun, proper singular</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>Sunil</td>\n",
+        "      <td>NNP</td>\n",
+        "      <td>PROPN</td>\n",
+        "      <td>noun, proper singular</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>Singh</td>\n",
+        "      <td>NNP</td>\n",
+        "      <td>PROPN</td>\n",
+        "      <td>noun, proper singular</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>5</th>\n",
+        "      <td>6th</td>\n",
+        "      <td>JJ</td>\n",
+        "      <td>ADJ</td>\n",
+        "      <td>adjective (English), other noun-modifier (Chin...</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6</th>\n",
+        "      <td>August</td>\n",
+        "      <td>NNP</td>\n",
+        "      <td>PROPN</td>\n",
+        "      <td>noun, proper singular</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>7</th>\n",
+        "      <td>2023</td>\n",
+        "      <td>CD</td>\n",
+        "      <td>NUM</td>\n",
+        "      <td>cardinal number</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        "       Token  Tag    Pos                                        Description\n",
+        "0    example   NN   NOUN                             noun, singular or mass\n",
+        "1       text   NN   NOUN                             noun, singular or mass\n",
+        "2  Singapore  NNP  PROPN                              noun, proper singular\n",
+        "3      Sunil  NNP  PROPN                              noun, proper singular\n",
+        "4      Singh  NNP  PROPN                              noun, proper singular\n",
+        "5        6th   JJ    ADJ  adjective (English), other noun-modifier (Chin...\n",
+        "6     August  NNP  PROPN                              noun, proper singular\n",
+        "7       2023   CD    NUM                                    cardinal number"
+       ]
+      },
+      "execution_count": 100,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "word, tag, pos, desc = [], [], [], []\n",
+     "for token in doc:\n",
+     "    if token.is_stop or token.is_punct:\n",
+     "        continue\n",
+     "    word.append(str(token))\n",
+     "    tag.append(str(token.tag_))\n",
+     "    pos.append(token.pos_)\n",
+     "    desc.append(spacy.explain(token.tag_))\n",
+     "pd.DataFrame(data=dict(Token=word, Tag=tag, Pos=pos, Description=desc))\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 101,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "from sumy.parsers.plaintext import PlaintextParser\n",
+     "from sumy.nlp.tokenizers import Tokenizer\n",
+     "from sumy.summarizers.text_rank import TextRankSummarizer\n",
+     "from sumy.summarizers.lex_rank import LexRankSummarizer\n",
+     "from sumy.summarizers.lsa import LsaSummarizer\n",
+     "from dataclasses import dataclass\n",
+     "@dataclass\n",
+     "class __AppConfig:\n",
+     "    \"\"\"app-wide configurations\"\"\"\n",
+     "    summarizers = dict(\n",
+     "        TextRankSummarizer=\"sumy.summarizers.text_rank\",\n",
+     "        LexRankSummarizer=\"sumy.summarizers.lex_rank\",\n",
+     "        LsaSummarizer=\"sumy.summarizers.lsa\",\n",
+     "    )\n",
+     "### make configs available to any module that imports this module\n",
+     "app_config = __AppConfig()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 102,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def class_from_name(module, class_name):\n",
+     "    return getattr(module, class_name)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 103,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "method=\"TextRankSummarizer\"\n",
+     "def get_summarizer(method):\n",
+     "    module=sys.modules[app_config.summarizers.get(method)]\n",
+     "    summarizer = class_from_name(module, method)\n",
+     "    return summarizer"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 108,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "text = \"\"\"Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n",
+     "Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.\"\"\""
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 109,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "<sumy.parsers.plaintext.PlaintextParser at 0x7fa774f4a510>"
+       ]
+      },
+      "execution_count": 109,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "parser = PlaintextParser.from_string(text, Tokenizer(\"english\"))\n",
+     "parser"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 117,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "8"
+       ]
+      },
+      "execution_count": 117,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "len(parser.document.sentences)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "nlp",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.0"
+   },
+   "orig_nbformat": 4
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ scikit-learn==1.2.*
+ joblib==1.2.*
+ pandas==1.5.*
+ streamlit==1.24.*
+ spacy==3.5.0
+ sumy==0.11.*
+ neattext==0.1.*
+ plotly==5.9.*
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz
src/app.py ADDED
@@ -0,0 +1,39 @@
+ """Application entry point, global configuration, application structure"""
+
+ from config import app_config
+ import data
+ import utils
+ import tab_about
+ import tab_ner
+ import tab_emotions
+ import tab_summarization
+ import streamlit as st
+
+
+ def init():
+     ### setup app-wide configuration
+     utils.setup_app(app_config)
+
+     ### load data only once and cache it
+     nlp = data.load_lang_model(app_config.spacy_lang_model)
+     data.load_nltk_punkt()
+     df = data.load_emotions_data(app_config.emotions_data_file)
+
+     ### initialize session state
+
+     ### setup app tab structure
+     about, ner, summarization, sentiment = utils.create_tabs(
+         ["ABOUT 👋", "NER & POS 🔍", "TEXT SUMMARIZATION 📝", "TEXT CLASSIFICATION 📑"]
+     )
+     with about:
+         tab_about.render()
+     with ner:
+         tab_ner.render(nlp)
+     with summarization:
+         tab_summarization.render()
+     with sentiment:
+         tab_emotions.render(df)
+
+
+ if __name__ == "__main__":
+     init()
src/config.py ADDED
@@ -0,0 +1,94 @@
+ """All app-specific user defined configurations are defined here"""
+
+ import os
+ from dataclasses import dataclass
+ # importing the summarizer classes also registers their sumy modules in
+ # sys.modules, which the dynamic summarizer lookup in tab_summarization.py uses
+ from sumy.summarizers.text_rank import TextRankSummarizer
+ from sumy.summarizers.lex_rank import LexRankSummarizer
+ from sumy.summarizers.lsa import LsaSummarizer
+ import plotly.express as px
+
+
+ ### define all plotting configuration here,
+ ### should not be accessed and changed directly hence leading "__"
+ @dataclass
+ class __PlotConfig:
+     """All plotting configurations are defined here"""
+
+     # Available themes (templates):
+     # ['ggplot2', 'seaborn', 'simple_white', 'plotly',
+     #  'plotly_white', 'plotly_dark', 'presentation',
+     #  'xgridoff', 'ygridoff', 'gridon', 'none']
+     theme = "plotly_dark"
+     cat_color_map = px.colors.qualitative.T10
+     cat_color_map_r = px.colors.qualitative.T10_r
+     cont_color_map = px.colors.sequential.amp
+     cont_color_map_r = px.colors.sequential.amp_r
+
+
+ ### define all app-wide configuration here,
+ ### should not be accessed and changed directly hence leading "__"
+ @dataclass
+ class __AppConfig:
+     """app-wide configurations"""
+
+     # get current working directory
+     cwd = os.getcwd()
+     banner_image_file = f"{cwd}/"
+     logo_image_file = f"{cwd}/assets/logo.png"
+     app_icon_file = f"{cwd}/assets/NLP.png"
+     app_title = "Applications"
+     readme_file_path = f"{cwd}/artifacts/about.md"
+     app_short_desc = "For common NLP use cases"
+     emotions_data_file = f"{cwd}/data/emotions.csv"
+     emoji_map = {
+         "joy": "😃",
+         "anger": "😡",
+         "disgust": "🤮",
+         "fear": "😨",
+         "neutral": "😐",
+         "sadness": "😔",
+         "shame": "🫣",
+         "surprise": "😲",
+     }
+     model_file = f"{cwd}/artifacts/lr_model.joblib"
+     sidebar_state = "expanded"  # collapsed
+     layout = "centered"  # wide
+     icon_question = "❓"
+     icon_important = "🎯"
+     icon_info = "ℹ️"
+     icon_stop = "⛔"
+     icon_about = "👋"
+     spacy_lang_model = "en_core_web_sm"
+     # sumy summarizers
+     summarizers = dict(
+         TextRankSummarizer={
+             "module": "sumy.summarizers.text_rank",
+             "desc": (
+                 "**`TextRank`** is a graph based ranking algorithm. Read this article"
+                 + " https://blogs.cornell.edu/info2040/2018/10/22/40068/"
+                 + " to get a good intuition behind it"
+             ),
+         },
+         LexRankSummarizer={
+             "module": "sumy.summarizers.lex_rank",
+             "desc": (
+                 "**`LexRank`** is another graph based ranking algorithm. Read this"
+                 + " https://github.com/crabcamp/lexrank"
+                 + " to get a good intuition behind it"
+             ),
+         },
+         LsaSummarizer={
+             "module": "sumy.summarizers.lsa",
+             "desc": (
+                 "**`LSA`** or Latent Semantic Analysis uses word frequency and Singular"
+                 + " Value Decomposition (SVD). Read this"
+                 + " https://www.analyticsvidhya.com/blog/2021/09/latent-semantic-analysis-and-its-uses-in-natural-language-processing/ article"
+                 + " to get a good intuition behind it"
+             ),
+         },
+     )
+
+
+ ### make configs available to any module that imports this module
+ app_config = __AppConfig()
+ plot_config = __PlotConfig
src/data.py ADDED
@@ -0,0 +1,55 @@
+ """All app-specific data and disk-IO related functionality implemented here"""
+
+ import subprocess
+ import joblib
+ import pandas as pd
+ import neattext.functions as nfx
+ import nltk
+ import spacy
+ import streamlit as st
+
+
+ @st.cache_resource
+ def load_lang_model(model):
+     """Download and then instantiate the language model"""
+     # subprocess.run(["python", "-m", "spacy", "download", model])
+     nlp = spacy.load(model)
+     return nlp
+
+
+ @st.cache_resource
+ def load_nltk_punkt():
+     """Downloads NLTK tokenizers"""
+     nltk.download("punkt")
+
+
+ @st.cache_resource
+ def load_emotions_data(data_file_path):
+     """Reads a given data-file and returns a DataFrame"""
+     return pd.read_csv(data_file_path)
+
+
+ def preprocess_data(df):
+     """Cleans and transforms data"""
+     df["Clean_Text"] = df["Text"].apply(nfx.remove_userhandles)
+     df["Clean_Text"] = df["Clean_Text"].apply(nfx.remove_stopwords)
+     df["Clean_Text"] = df["Clean_Text"].apply(nfx.remove_urls)
+     df["Clean_Text"] = df["Clean_Text"].apply(nfx.remove_punctuations)
+     return df
+
+
+ def preprocess_pred_data(input_data):
+     """Cleans a single raw text input and returns it as a one-element list"""
+     input_data = nfx.remove_userhandles(input_data)
+     input_data = nfx.remove_stopwords(input_data)
+     input_data = nfx.remove_urls(input_data)
+     input_data = nfx.remove_punctuations(input_data)
+     return [input_data]
+
+
+ def save_model(model_obj, model_file_path):
+     """Serializes the model object to disk"""
+     joblib.dump(value=model_obj, filename=model_file_path)
+
+
+ @st.cache_resource
+ def load_model(model_file_path):
+     """Loads a serialized model from disk"""
+     return joblib.load(model_file_path)
src/emotion_classification.py ADDED
@@ -0,0 +1,45 @@
+ import data
+ import config
+ from sklearn.pipeline import Pipeline
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split
+
+
+ def __train_model(df, full=False):
+     ### train on full data for final model else split the data and then train
+     if full:
+         X_train = df["Clean_Text"]
+         y_train = df["Emotion"]
+     else:
+         X_train, X_test, y_train, y_test = train_test_split(
+             df["Clean_Text"], df["Emotion"], test_size=0.2, random_state=42
+         )
+     ### build model pipeline
+     lr_pipeline = Pipeline(
+         steps=[("cv", CountVectorizer()), ("lr", LogisticRegression(max_iter=300))]
+     )
+     ### train and test the model
+     print(f"\nTraining LogisticRegression with {X_train.shape[0]} samples...")
+     lr_pipeline.fit(X_train, y_train)
+     if not full:
+         print(f"Testing LogisticRegression with {X_test.shape[0]} samples...")
+         score = lr_pipeline.score(X_test, y_test)
+         print(f"Accuracy achieved: [{score*100:.2f}%].")
+     return lr_pipeline
+
+
+ if __name__ == "__main__":
+     emotions_df = data.load_emotions_data(config.app_config.emotions_data_file)
+     emotions_df = data.preprocess_data(emotions_df)
+     model = __train_model(emotions_df, full=True)
+     data.save_model(model, config.app_config.model_file)
+     print(f"Saved model to: [{config.app_config.model_file}]")
+
+     ### Test code
+     # model = data.load_model(config.app_config.model_file)
+     # test_text = "I am loving NLP and it makes me feel so good"
+     # print(f"\nTesting model with sample text '{test_text}'\nPrediction:")
+     # print(model.predict([test_text]))
+     # print(model.classes_)
+     # print(model.predict_proba([test_text]))
src/plot.py ADDED
@@ -0,0 +1,37 @@
+ from config import plot_config
+ import pandas as pd
+ import plotly.express as px
+ import plotly.io as io
+
+ # setup app-wide plotly theme
+ io.templates.default = plot_config.theme
+
+
+ def plot_proba(classes, proba):
+     """Bar chart of the prediction probability for each emotion class"""
+     df_proba = pd.DataFrame({"Emotions": classes, "Probability": proba})
+     df_proba["Emotions"] = df_proba["Emotions"].str.upper()
+     df_proba = df_proba.sort_values(by="Probability", ascending=False)
+     fig = px.bar(
+         data_frame=df_proba,
+         x="Probability",
+         y="Emotions",
+         color="Emotions",
+         title="Prediction Probabilities",
+         color_discrete_sequence=plot_config.cat_color_map,
+     )
+     return fig
+
+
+ def plot_class_dist(df):
+     """Bar chart of the emotion class distribution in the training data"""
+     df_count = pd.DataFrame(df["Emotion"].value_counts()).reset_index()
+     df_count.columns = ["Emotions", "Count"]
+     df_count["Emotions"] = df_count["Emotions"].str.upper()
+     fig = px.bar(
+         data_frame=df_count,
+         x="Emotions",
+         y="Count",
+         color="Emotions",
+         title="Class Distribution",
+         color_discrete_sequence=plot_config.cat_color_map,
+     )
+     return fig
src/tab_about.py ADDED
@@ -0,0 +1,22 @@
+ """About tab rendering functionality"""
+
+ import streamlit as st
+ from config import app_config
+
+
+ ###
+ ### INTERNAL FUNCTIONS
+ ###
+ def __section(header):
+     """Render the section on this page"""
+     st.header(header)
+     with open(app_config.readme_file_path, "r") as f:
+         about = f.read()
+     st.markdown(about, unsafe_allow_html=True)
+
+
+ ###
+ ### MAIN FLOW, entry point
+ ###
+ def render():
+     __section("About The App")
src/tab_emotions.py ADDED
@@ -0,0 +1,57 @@
+ """Emotions (text classification) tab rendering functionality"""
+
+ from config import app_config
+ import plot
+ import streamlit as st
+ import data
+ import utils
+
+
+ ###
+ ### INTERNAL FUNCTIONS
+ ###
+ def __section(header, df):
+     """Build page UI elements"""
+     st.header(header)
+     ### accept text input, make prediction and show results
+     st.write(
+         "`Enter the text` to be classified in the text area and then click `Predict`"
+     )
+     text = st.text_area("Enter Text:", height=200)
+     if st.button("Predict"):
+         model = data.load_model(app_config.model_file)
+         pred, pred_proba = utils.make_prediction(model, text, proba=True)
+         pred_col, conf_col = st.columns(2)
+         with pred_col:
+             emotion = pred[0]
+             st.success(
+                 f"Detected Emotion: {emotion.upper()} {app_config.emoji_map[emotion]}"
+             )
+
+         with conf_col:
+             # predict_proba returns fractions in [0, 1]; scale to percent
+             st.success(f"Confidence: {pred_proba.max() * 100:.2f}%")
+         fig = plot.plot_proba(model.classes_, pred_proba)
+         st.plotly_chart(fig, use_container_width=True)
+
+     ### Supplementary details about the model used
+     st.divider()
+     with st.expander("Supplementary under-the-hood details:"):
+         st.info(
+             body="""
+             A trained LogisticRegression model is used here for emotion detection. The
+             model has been trained on 34,000 labeled samples. Sample data and the class
+             distribution are shown below.
+             """,
+             icon=app_config.icon_info,
+         )
+         st.dataframe(df.loc[:15, ["Clean_Text", "Emotion"]])
+         fig = plot.plot_class_dist(df)
+         st.plotly_chart(fig, use_container_width=True)
+
+
+ ###
+ ### MAIN FLOW, entry point
+ ###
+ def render(df):
+     __section("Emotions Detection", df)
src/tab_ner.py ADDED
@@ -0,0 +1,64 @@
+ """NER & POS tab rendering functionality"""
+ import pandas as pd
+ import streamlit as st
+ import spacy
+ from spacy import displacy
+
+
+ ###
+ ### INTERNAL FUNCTIONS
+ ###
+ def __analyse(text, analysis_type, nlp):
+     """Analyse and return the named entities or POS tags for the given text"""
+     doc = nlp(text)
+     ### analyse based on type
+     if analysis_type == "NER":
+         heading = "Named Entity Recognition (NER)"
+         result = displacy.render(docs=doc, style="ent", jupyter=False)
+         label, desc = [], []
+         for ent in doc.ents:
+             label.append(ent.label_)
+             desc.append(spacy.explain(ent.label_))
+         df = pd.DataFrame(data={"Codes": label, "Description": desc})
+         # drop=True avoids carrying the old index along as an extra column
+         df = df.drop_duplicates().reset_index(drop=True)
+     elif analysis_type == "POS":
+         result = ""
+         word, tag, pos, desc = [], [], [], []
+         for token in doc:
+             if token.is_stop or token.is_punct:
+                 continue
+             word.append(str(token))
+             tag.append(str(token.tag_))
+             pos.append(token.pos_)
+             desc.append(spacy.explain(token.tag_))
+         df = pd.DataFrame(data=dict(Token=word, Tag=tag, Pos=pos, Description=desc))
+         heading = "Parts of speech tagging (POS)"
+     return result, df, heading
+
+
+ def __section(header, nlp):
+     """Build page UI elements"""
+     st.header(header)
+     st.write(
+         "Choose the analysis-type (NER/POS) to be performed, "
+         + "enter the text in the text area and then click Analyse"
+     )
+     analysis_type = st.radio(label="Type:", options=["NER", "POS"])
+     text = st.text_area("Enter text:", height=300)
+     ### analyse the entered text and show the results
+     if st.button("Analyse"):
+         result, df, heading = __analyse(text, analysis_type, nlp)
+         st.subheader(heading)
+         st.divider()
+         st.write(result, unsafe_allow_html=True)
+         st.write(" ")
+         st.dataframe(df, use_container_width=True)
+         st.divider()
+
+
+ ###
+ ### MAIN FLOW, entry point
+ ###
+ def render(nlp):
+     """NER tab page"""
+     __section("Named Entity & Parts Of Speech Recognition", nlp)
src/tab_summarization.py ADDED
@@ -0,0 +1,83 @@
+ """Summarization tab rendering functionality"""
+
+ from config import app_config
+ import utils
+ import sys
+ import streamlit as st
+ from sumy.parsers.plaintext import PlaintextParser
+ from sumy.nlp.tokenizers import Tokenizer
+ from sumy.evaluation import rouge_n
+
+
+ ###
+ ### INTERNAL FUNCTIONS
+ ###
+ def __get_summarizer(summarizer_type):
+     """Helper to get summarizer object given its name as string"""
+     summarizer_dict = app_config.summarizers.get(summarizer_type)
+     module = sys.modules[summarizer_dict["module"]]
+     summarizer = utils.get_class_from_name(module, summarizer_type)
+     desc = summarizer_dict["desc"]
+     return summarizer(), desc
+
+
+ def __summarize(text, summarizer, n_sentences):
+     ### instantiate the text parser, summarize text and return the summary text
+     parser = PlaintextParser.from_string(text, Tokenizer("english"))
+     summary_tuple = summarizer(parser.document, n_sentences)
+     summary_text = ""
+     for sentence in summary_tuple:
+         # separate sentences with a space so they don't run together
+         summary_text += str(sentence) + " "
+     ### compute sentence counts and ROUGE score for the summary text
+     rouge = rouge_n(
+         evaluated_sentences=summary_tuple,
+         reference_sentences=parser.document.sentences,
+         n=2,
+     )
+     stats = f"""
+         Number of sentences in original text: **{len(parser.document.sentences)}**
+         Number of sentences in summary text: **{len(summary_tuple)}**
+         ROUGE (bi-gram) score: **{rouge}**
+     """
+     return summary_text, stats
+
+
+ def __section(header):
+     """Build page UI elements"""
+     st.header(header)
+     st.write(
+         "Choose the `Summarization Method`, `Enter Text` in the text "
+         + "area, choose the `Number Of Sentences` required in summary text "
+         + "and then click `Summarize`"
+     )
+     summarizer_type = st.radio(
+         "Summarization Method:",
+         options=[
+             # "WordFrequency",
+             "TextRankSummarizer",
+             "LexRankSummarizer",
+             "LsaSummarizer",
+         ],
+     )
+     ### Based on type selected, fetch the summarizer object and show short description
+     summarizer, desc = __get_summarizer(summarizer_type)
+     st.info(body=f"{desc}", icon=app_config.icon_info)
+     text = st.text_area("Enter text:", height=300, key="summarization")
+     n_sentences = st.slider(
+         label="Number Of Sentences", min_value=1, max_value=10, value=3
+     )
+     ### summarize the entered text and show the results
+     if st.button("Summarize"):
+         summary, stats = __summarize(text, summarizer, n_sentences)
+         st.divider()
+         st.subheader("Summary")
+         st.success(stats)
+         st.write(summary)
+         st.divider()
+
+
+ ###
+ ### MAIN FLOW, entry point
+ ###
+ def render():
+     __section("Text Summarization")
src/utils.py ADDED
@@ -0,0 +1,61 @@
+ """App agnostic reusable utility functionality"""
+
+ from config import app_config
+ import data
+ from typing import List
+ import streamlit as st
+
+
+ def setup_app(config):
+     """Sets up all application icon, banner, title"""
+     st.set_page_config(
+         page_title=config.app_title,
+         page_icon=app_config.app_icon_file,
+         initial_sidebar_state=config.sidebar_state,
+         layout=config.layout,
+     )
+     ### Logo and App title, description
+     with st.container():
+         app_icon, app_title, logo = st.columns([0.2, 0.9, 0.3])
+         app_icon.image(image=app_config.app_icon_file, width=80)
+         app_title.markdown(
+             f"<h1 style='text-align: left; color: #03989e;'>{app_config.app_title}</h1> ",
+             unsafe_allow_html=True,
+         )
+         app_title.markdown(
+             f"<p style='text-align: left;'>{app_config.app_short_desc}</p>",
+             unsafe_allow_html=True,
+         )
+         logo.image(image=app_config.logo_image_file, width=100)
+
+
+ def create_tabs(tabs: List[str]):
+     """Creates streamlit tabs"""
+     return st.tabs(tabs)
+
+
+ def download_file(btn_label, data, file_name, mime_type):
+     """Creates a download button for data download"""
+     st.download_button(label=btn_label, data=data, file_name=file_name, mime=mime_type)
+
+
+ def get_class_from_name(module, class_name: str):
+     """Returns the class given its name as a string and the module object"""
+     return getattr(module, class_name)
+
+
+ def make_prediction(model, input_data, proba=False):
+     """
+     prediction pipeline for the model; model must have a predict method, and a
+     predict_proba method if prediction probabilities are to be returned
+     """
+     ### preprocess the input and return it in a shape suitable for this model
+     processed_input_data = data.preprocess_pred_data(input_data)
+     ### call model's predict method
+     pred = model.predict(processed_input_data)
+     ### call model's predict_proba method if required (None when not requested);
+     ### squeeze the ndarray so a single prediction comes back as a 1-D array
+     pred_proba = None
+     if proba:
+         pred_proba = model.predict_proba(processed_input_data).squeeze()
+     return pred, pred_proba
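A hypothetical usage sketch for `make_prediction`, adapted from the commented test code in `src/emotion_classification.py` (the `@st.cache_resource` decorators mean these modules are really intended to run inside the Streamlit app):

```python
# Assumes this runs alongside the other src/ modules, from the repo root.
import data
import utils
from config import app_config

model = data.load_model(app_config.model_file)
pred, pred_proba = utils.make_prediction(
    model, "I am loving NLP and it makes me feel so good", proba=True
)
print(pred[0], f"{pred_proba.max() * 100:.2f}%")
```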