How to set chat_input value from javascript

I’m currently try to add a client side STT button between chat input box, so after user click this button, user can input text with speech. I have searched the community and I can see there are solutions for speech input, but need to call a API for speech recognition, I only want the windows native STT.

My solution is add some javascript code to dynamically create a button beside the chat input, and after user input the voice, browser TTS will recognize the text, and I will then put this transcript into the chat input, the scripts looks like this:

<script>
    function createSpeechButton() {
        // Create the button element
        const button = document.createElement("button");
        button.className = "";
        button.style.border = "none";
        button.style.width = "48px";
        button.setAttribute("tabindex", "1");
        button.setAttribute("type", "button");

        // Create the SVG element
        const svg = document.createElementNS("http://www.w3.org/2000/svg", "svg");
        svg.className = "";
        svg.setAttribute("focusable", "false");
        svg.setAttribute("aria-hidden", "true");
        svg.setAttribute("viewBox", "0 0 24 24");
        svg.setAttribute("data-testid", "MicNoneIcon");

        // Create the path inside the SVG
        const path = document.createElementNS("http://www.w3.org/2000/svg", "path");
        path.setAttribute("d", "M12 14c1.66 0 2.99-1.34 2.99-3L15 5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3m-1.2-9.1c0-.66.54-1.2 1.2-1.2.66 0 1.2.54 1.2 1.2l-.01 6.2c0 .66-.53 1.2-1.19 1.2-.66 0-1.2-.54-1.2-1.2zm6.5 6.1c0 3-2.54 5.1-5.3 5.1S6.7 14 6.7 11H5c0 3.41 2.72 6.23 6 6.72V21h2v-3.28c3.28-.48 6-3.3 6-6.72z");
        svg.appendChild(path);

        // Create the span for touch ripple effect
        const rippleSpan = document.createElement("span");
        rippleSpan.className = "";
        rippleSpan.style.width = "48px";

        // Create inner ripple span
        const innerRipple = document.createElement("span");
        innerRipple.className = "";
        innerRipple.style.width = "41px";
        innerRipple.style.height = "41px";
        innerRipple.style.top = "-0.5px";
        innerRipple.style.left = "-0.5px";

        // Add child span to ripple
        const childRipple = document.createElement("span");
        childRipple.className = "";
        innerRipple.appendChild(childRipple);

        // Append ripple spans
        rippleSpan.appendChild(innerRipple);

        // Append SVG and ripple effect to the button
        button.appendChild(svg);
        button.appendChild(rippleSpan);

        return button;
    }

    function enableSpeechRecognition() {
        // check if local speech recognition is supported.
        const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
        if (!SpeechRecognition) {
            console.log('Browser does not support speech recognition.');
            return;
        }

        const recognition = new SpeechRecognition();
        recognition.lang = 'en-US';
        recognition.continuous = false;
        recognition.interimResults = false;

        // find the chat input textarea.
        var voiceInput = parent.document.querySelector('textarea[data-testid="stChatInputTextArea"]');
        var speechButton = createSpeechButton();
        voiceInput.parentElement.prepend(speechButton);

        recognition.addEventListener('result', (event) => {{
            const transcript = event.results[0][0].transcript;
            if (transcript === '') {
                return;
            }

            console.log('Speech recognition result:', transcript);

            voiceInput.value = transcript;
            voiceInput.dispatchEvent(new Event('input', { bubbles: true }));
        }});

        recognition.addEventListener('end', () => {{
            console.log('Speech recognition ended.');
        }});

        recognition.addEventListener('error', (event) => {{
            console.error('Speech recognition error:', event.error);
        }});

        // start after click a button.
        speechButton.addEventListener('click', () => {{
            recognition.start();
        }});
    }
    enableSpeechRecognition();
</script>

This script is then loaded into streamlit app like this:

        user_input = st.chat_input("You:")
        st.components.v1.html(load_speech_recognition_html())

I can see that this is almost working, after user end the speak, the text is set in chat input, the problem is after we click the chat input window (or focus on the chat input window), the text will be cleared.

Is there an easy way to solve this problem?

Thank you very much.