PDF2Audio

Sleeping

App Files Files Community

matsuap committed on Jun 3, 2025

Commit

63bca4f

1 Parent(s): e22b652

Refactor app.py to remove unused functions and comments, update UI labels to Japanese, and reorganize input sections for better clarity.

Browse files

Files changed (1) hide show

app.py +58 -92

app.py CHANGED Viewed

@@ -16,19 +16,6 @@ from pydantic import BaseModel, ValidationError
 from pypdf import PdfReader
 from tenacity import retry, retry_if_exception_type
-import re
-def read_readme():
-    readme_path = Path("README.md")
-    if readme_path.exists():
-        with open(readme_path, "r") as file:
-            content = file.read()
-            # Use regex to remove metadata enclosed in -- ... --
-            content = re.sub(r'--.*?--', '', content, flags=re.DOTALL)
-            return content
-    else:
-        return "README.md not found. Please check the repository for more information."
 # Define multiple sets of instruction templates
 INSTRUCTION_TEMPLATES = {
 ################# PODCAST ##################
@@ -655,16 +642,6 @@ def generate_audio(
         edited_transcript=edited_transcript_processed,
         user_feedback=user_feedback_processed
     )
-    # llm_output = generate_dialogue(
-    #     '本ガイドブックは、政府情報システム開発におけるアジャイル開発の適用を支援するために用意されたものです。',
-    #     intro_instructions='',
-    #     text_instructions='',
-    #     scratch_pad_instructions='',
-    #     prelude_dialog='',
-    #     podcast_dialog_instructions='',
-    #     edited_transcript='',
-    #     user_feedback=''
-    # )
     print('llm_output:', llm_output)
     # Generate audio from the transcript
@@ -763,108 +740,101 @@ with gr.Blocks(title="PDF to Audio", css="""
 """) as demo:
     with gr.Row(elem_id="header"):
-        with gr.Column(scale=4):
-            gr.Markdown("# Convert PDFs into an audio podcast, lecture, summary and others\n\nFirst, upload one or more PDFs, select options, then push Generate Audio.\n\nYou can also select a variety of custom option and direct the way the result is generated.", elem_id="title")
-        with gr.Column(scale=1):
-            gr.HTML('''
-                <div id="logo_container">
-                    <img src="https://huggingface.co/spaces/lamm-mit/PDF2Audio/resolve/main/logo.png" id="logo_image" alt="Logo">
-                </div>
-            ''')
     #gr.Markdown("")
-    submit_btn = gr.Button("Generate Audio", elem_id="submit_btn")
     with gr.Row(elem_id="main_container"):
         with gr.Column(scale=2):
-            files = gr.Files(label="PDFs", file_types=[], )
             openai_api_key = gr.Textbox(
-                label="OpenAI API Key",
                 visible=True,  # Always show the API key field
-                placeholder="Enter your OpenAI API Key here...",
                 type="password"  # Hide the API key input
             )
             text_model = gr.Dropdown(
-                label="Text Generation Model",
                 choices=STANDARD_TEXT_MODELS,
                 value="o1-preview-2024-09-12", #"gpt-4o-mini",
-                info="Select the model to generate the dialogue text.",
             )
             audio_model = gr.Dropdown(
-                label="Audio Generation Model",
                 choices=STANDARD_AUDIO_MODELS,
                 value="tts-1",
-                info="Select the model to generate the audio.",
             )
             speaker_1_voice = gr.Dropdown(
-                label="Speaker 1 Voice",
                 choices=STANDARD_VOICES,
                 value="alloy",
-                info="Select the voice for Speaker 1.",
             )
             speaker_2_voice = gr.Dropdown(
-                label="Speaker 2 Voice",
                 choices=STANDARD_VOICES,
                 value="echo",
-                info="Select the voice for Speaker 2.",
             )
             api_base = gr.Textbox(
-                label="Custom API Base",
-                placeholder="Enter custom API base URL if using a custom/local model...",
-                info="If you are using a custom or local model, provide the API base URL here, e.g.: http://localhost:8080/v1 for llama.cpp REST server.",
             )
         with gr.Column(scale=3):
             template_dropdown = gr.Dropdown(
-                label="Instruction Template",
                 choices=list(INSTRUCTION_TEMPLATES.keys()),
                 value="podcast",
-                info="Select the instruction template to use. You can also edit any of the fields for more tailored results.",
-            )
-            intro_instructions = gr.Textbox(
-                label="Intro Instructions",
-                lines=10,
-                value=INSTRUCTION_TEMPLATES["podcast"]["intro"],
-                info="Provide the introductory instructions for generating the dialogue.",
-            )
-            text_instructions = gr.Textbox(
-                label="Standard Text Analysis Instructions",
-                lines=10,
-                placeholder="Enter text analysis instructions...",
-                value=INSTRUCTION_TEMPLATES["podcast"]["text_instructions"],
-                info="Provide the instructions for analyzing the raw data and text.",
-            )
-            scratch_pad_instructions = gr.Textbox(
-                label="Scratch Pad Instructions",
-                lines=15,
-                value=INSTRUCTION_TEMPLATES["podcast"]["scratch_pad"],
-                info="Provide the scratch pad instructions for brainstorming presentation/dialogue content.",
             )
-            prelude_dialog = gr.Textbox(
-                label="Prelude Dialog",
-                lines=5,
-                value=INSTRUCTION_TEMPLATES["podcast"]["prelude"],
-                info="Provide the prelude instructions before the presentation/dialogue is developed.",
-            )
-            podcast_dialog_instructions = gr.Textbox(
-                label="Podcast Dialog Instructions",
-                lines=20,
-                value=INSTRUCTION_TEMPLATES["podcast"]["dialog"],
-                info="Provide the instructions for generating the presentation or podcast dialogue.",
-            )
-    audio_output = gr.Audio(label="Audio", format="mp3", interactive=False, autoplay=False)
-    transcript_output = gr.Textbox(label="Transcript", lines=20, show_copy_button=True)
-    original_text_output = gr.Textbox(label="Original Text", lines=10, visible=False)
     error_output = gr.Textbox(visible=False)  # Hidden textbox to store error message
-    use_edited_transcript = gr.Checkbox(label="Use Edited Transcript (check if you want to make edits to the initially generated transcript)", value=False)
-    edited_transcript = gr.Textbox(label="Edit Transcript Here. E.g., mark edits in the text with clear instructions. E.g., '[ADD DEFINITION OF MATERIOMICS]'.", lines=20, visible=False,
                                    show_copy_button=True, interactive=False)
-    user_feedback = gr.Textbox(label="Provide Feedback or Notes", lines=10, #placeholder="Enter your feedback or notes here..."
-                              )
-    regenerate_btn = gr.Button("Regenerate Audio with Edits and Feedback")
     # Function to update the interactive state of edited_transcript
     def update_edit_box(checkbox_value):
         """Build the Gradio update that toggles the edited-transcript box.

         Args:
             checkbox_value: Truthy to enable and show the box, falsy to
                 disable and hide it. Presumably the state of the
                 ``use_edited_transcript`` checkbox; the event wiring is not
                 visible in this hunk.

         Returns:
             The value of ``gr.update(...)`` with ``interactive`` set to
             ``checkbox_value``, ``lines`` fixed at 20, and ``visible`` set to
             the truthiness of ``checkbox_value``.
         """
         # The original chose 20 lines in both branches of its conditional,
         # so the height is a constant rather than state-dependent.
         return gr.update(
             interactive=checkbox_value,
             lines=20,
             visible=bool(checkbox_value),
         )
@@ -933,10 +903,6 @@ with gr.Blocks(title="PDF to Audio", css="""
         inputs=[error_output],
         outputs=[]
     )
-    # Add README content at the bottom
-    gr.Markdown("---")  # Horizontal line to separate the interface from README
-    gr.Markdown(read_readme())
 # Enable queueing for better performance
 demo.queue(max_size=20, default_concurrency_limit=32)

 from pypdf import PdfReader
 from tenacity import retry, retry_if_exception_type
 # Define multiple sets of instruction templates
 INSTRUCTION_TEMPLATES = {
 ################# PODCAST ##################
         edited_transcript=edited_transcript_processed,
         user_feedback=user_feedback_processed
     )
     print('llm_output:', llm_output)
     # Generate audio from the transcript
 """) as demo:
     with gr.Row(elem_id="header"):
+        gr.Markdown("# PDFを音声ポッドキャスト・講義・要約などに変換\n\nまず、1つ以上のPDFをアップロードし、オプションを選択してから「音声を生成」ボタンを押してください。\n\nカスタムオプションも選択でき、生成方法を細かく指示できます。", elem_id="title")
     #gr.Markdown("")
     with gr.Row(elem_id="main_container"):
         with gr.Column(scale=2):
+            files = gr.Files(label="PDFファイル", file_types=[], )
             openai_api_key = gr.Textbox(
+                label="OpenAI APIキー",
                 visible=True,  # Always show the API key field
+                placeholder="ここにOpenAI APIキーを入力してください...",
                 type="password"  # Hide the API key input
             )
             text_model = gr.Dropdown(
+                label="テキスト生成モデル",
                 choices=STANDARD_TEXT_MODELS,
                 value="o1-preview-2024-09-12", #"gpt-4o-mini",
+                info="対話テキストを生成するモデルを選択してください。",
             )
             audio_model = gr.Dropdown(
+                label="音声生成モデル",
                 choices=STANDARD_AUDIO_MODELS,
                 value="tts-1",
+                info="音声を生成するモデルを選択してください。",
             )
             speaker_1_voice = gr.Dropdown(
+                label="話者1の声",
                 choices=STANDARD_VOICES,
                 value="alloy",
+                info="話者1の音声を選択してください。",
             )
             speaker_2_voice = gr.Dropdown(
+                label="話者2の声",
                 choices=STANDARD_VOICES,
                 value="echo",
+                info="話者2の音声を選択してください。",
             )
             api_base = gr.Textbox(
+                label="カスタムAPIベースURL",
+                placeholder="カスタム/ローカルモデルを使う場合はAPIベースURLを入力してください...",
+                info="カスタムやローカルモデルを使う場合、ここにAPIベースURLを入力してください。例: http://localhost:8080/v1 (llama.cpp RESTサーバー用)",
             )
         with gr.Column(scale=3):
             template_dropdown = gr.Dropdown(
+                label="指示テンプレート",
                 choices=list(INSTRUCTION_TEMPLATES.keys()),
                 value="podcast",
+                info="使用する指示テンプレートを選択してください。各フィールドを編集してカスタマイズも可能です。",
             )
+            with gr.Accordion("プロンプト", open=False):
+                intro_instructions = gr.Textbox(
+                    label="イントロ指示",
+                    lines=10,
+                    value=INSTRUCTION_TEMPLATES["podcast"]["intro"],
+                    info="対話生成のためのイントロ指示を入力してください。",
+                )
+                text_instructions = gr.Textbox(
+                    label="テキスト分析指示",
+                    lines=10,
+                    placeholder="テキスト分析の指示を入力してください...",
+                    value=INSTRUCTION_TEMPLATES["podcast"]["text_instructions"],
+                    info="生データやテキストの分析指示を入力してください。",
+                )
+                scratch_pad_instructions = gr.Textbox(
+                    label="ブレインストーミング指示",
+                    lines=15,
+                    value=INSTRUCTION_TEMPLATES["podcast"]["scratch_pad"],
+                    info="プレゼン/対話内容のブレインストーミング指示を入力してください。",
+                )
+                prelude_dialog = gr.Textbox(
+                    label="プレリュード指示",
+                    lines=5,
+                    value=INSTRUCTION_TEMPLATES["podcast"]["prelude"],
+                    info="プレゼン/対話作成前のプレリュード指示を入力してください。",
+                )
+                podcast_dialog_instructions = gr.Textbox(
+                    label="ポッドキャスト対話指示",
+                    lines=20,
+                    value=INSTRUCTION_TEMPLATES["podcast"]["dialog"],
+                    info="プレゼンやポッドキャスト対話生成の指示を入力してください。",
+                )
+    submit_btn = gr.Button("音声を生成", elem_id="submit_btn", variant="primary")
+    audio_output = gr.Audio(label="音声", format="mp3", interactive=False, autoplay=False)
+    transcript_output = gr.Textbox(label="書き起こしテキスト", lines=20, show_copy_button=True)
+    original_text_output = gr.Textbox(label="元テキスト", lines=10, visible=False)
     error_output = gr.Textbox(visible=False)  # Hidden textbox to store error message
+    use_edited_transcript = gr.Checkbox(label="書き起こしテキストを編集する（チェックすると編集欄が有効化）", value=False)
+    edited_transcript = gr.Textbox(label="編集用テキスト欄（例: '[ここに定義を追加]' など明確な指示を記載）", lines=20, visible=False,
                                    show_copy_button=True, interactive=False)
+    user_feedback = gr.Textbox(label="フィードバック・メモ", lines=10)
+    regenerate_btn = gr.Button("編集・フィードバックで再生成")
     # Function to update the interactive state of edited_transcript
     def update_edit_box(checkbox_value):
         """Build the Gradio update that toggles the edited-transcript box.

         Args:
             checkbox_value: Truthy to enable and show the box, falsy to
                 disable and hide it. Presumably the state of the
                 ``use_edited_transcript`` checkbox; the event wiring is not
                 visible in this hunk.

         Returns:
             The value of ``gr.update(...)`` with ``interactive`` set to
             ``checkbox_value``, ``lines`` fixed at 20, and ``visible`` set to
             the truthiness of ``checkbox_value``.
         """
         # The original chose 20 lines in both branches of its conditional,
         # so the height is a constant rather than state-dependent.
         return gr.update(
             interactive=checkbox_value,
             lines=20,
             visible=bool(checkbox_value),
         )
         inputs=[error_output],
         outputs=[]
     )
 # Enable queueing for better performance
 demo.queue(max_size=20, default_concurrency_limit=32)