I am running this locally on Ubuntu. I have built a system that tracks my index-finger movement and, when the tracked line forms a closed shape, draws a circle over it. In the Streamlit app, as soon as the circle is drawn, null arrays start printing below the frame. I am not printing anything myself and cannot identify the issue. Sharing the code below (rough stubs for the helper_functions imports follow at the end):
import streamlit as st
import cv2
import numpy as np
import os  # used by os.makedirs / os.path.join below
import time
import mediapipe as mp
import ffmpeg
import librosa
from helper_functions import *
from classes import STT_and_class_detector
st.session_state.output_video_file = 'output_video.avi'
st.session_state.output_directory = 'frames_output'
st.session_state.output_audio_file = 'output_audio.wav'
st.session_state.audio_format = 'wav'
def run_recording(placeholder):
    annotations = [[]]
    annotationNumber = -1
    annotationStart = False

    # MediaPipe hand tracking (single hand)
    mp_drawing = mp.solutions.drawing_utils
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)

    # Webcam capture and video writer
    frame_width = 640
    frame_height = 480
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, frame_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_height)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(st.session_state.output_video_file, fourcc, 20, (frame_width, frame_height))

    frames = []
    os.makedirs(st.session_state.output_directory, exist_ok=True)

    frame_delay = 0
    counter_list = []
    unique_circles = []
    final_circles = []
    previous_length = 0
    list_of_focus = []
    frame_count = 0
    circle_stay = 0
    last_circle = 0

    # Record microphone audio in the background via ffmpeg (ALSA default device)
    ffmpeg_cmd = (
        ffmpeg
        .input('default', format='alsa', channels=1)
        .output(st.session_state.output_audio_file, format=st.session_state.audio_format)
        .overwrite_output()
        .run_async(pipe_stdout=True, pipe_stderr=True)
    )

    tracking_lost_counter = 0
    tracking_lost_threshold = 10
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)
        second_frame = frame.copy()
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        if results.multi_hand_landmarks:
            tracking_lost_counter = 0
            for hand_landmarks in results.multi_hand_landmarks:
                index_fingertip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
                x = int(index_fingertip.x * frame.shape[1])
                y = int(index_fingertip.y * frame.shape[0])

                # Heuristic pointing check: the index fingertip (landmark 8) should be
                # the highest landmark, with an override based on landmark 17
                is_highest = True
                for landmark_id, landmark in enumerate(hand_landmarks.landmark):
                    if landmark_id != 8 and landmark.y * frame.shape[0] < y:
                        is_highest = False
                        break
                for landmark_id, landmark in enumerate(hand_landmarks.landmark):
                    if landmark_id == 17 and landmark.y * frame.shape[0] < y:
                        is_highest = True
                        break

                if is_highest:
                    if not annotationStart:
                        annotationStart = True
                        annotationNumber += 1
                        annotations.append([])
                    annotations[annotationNumber].append((x, y))
                    cv2.circle(frame, (x, y), 5, (255, 0, 255), -1)
                else:
                    annotationStart = False
        else:
            # Reset everything when the hand is lost for too many consecutive frames
            tracking_lost_counter += 1
            if tracking_lost_counter > tracking_lost_threshold:
                annotations = [[]]
                annotationNumber = -1
                annotationStart = False
                frame_delay = 0
                counter_list = []
                tracking_lost_counter = 0

        # Draw the tracking line(s)
        for i, annotation in enumerate(annotations):
            for j in range(len(annotation)):
                if j != 0:
                    cv2.line(frame, annotation[j - 1], annotation[j], (0, 0, 200), 2)

        # Interpolate extra points along the last segment of the previous annotation
        if len(annotations) > 1:
            list_of_focus = annotations[-2]
        if len(list_of_focus) > 1 and len(list_of_focus) != previous_length:
            num_points = 10
            x_values = np.linspace(list_of_focus[-2][0], list_of_focus[-1][0], num_points)
            y_values = np.linspace(list_of_focus[-2][1], list_of_focus[-1][1], num_points)
            for k in range(num_points):
                counter_list.append((int(x_values[k]), int(y_values[k])))
            previous_length = len(list_of_focus)
        # Detect a closed shape in the traced points and overlay an ellipse on it
        extracted_portion = selected_region(counter_list)
        try:
            circle_values = make_circle(second_frame, extracted_portion)
            if circle_values is not None:
                final_circles.append(circle_values)
            if final_circles:
                [unique_circles.append(item) for item in final_circles if item not in unique_circles]
            filtered_circles = [circle for i, circle in enumerate(unique_circles)
                                if all(center_distance(circle, other_circle) >= 2 for other_circle in unique_circles[:i])]
            if filtered_circles and len(filtered_circles) > 0:
                if len(filtered_circles) > last_circle:
                    circle_stay = 30
                if circle_stay > 0:
                    cv2.ellipse(second_frame, filtered_circles[-1][3], (0, 255, 0), 3)
                    circle_stay -= 1
                last_circle = len(filtered_circles)
            if circle_values is not None:
                frame_delay += 1
                if frame_delay > 5:
                    annotations = [[]]
                    annotationNumber = -1
                    annotationStart = False
                    frame_delay = 0
                    counter_list = []

            # Blend the drawing frame with the detection frame, show it in the
            # Streamlit placeholder, and write it to the video file / frames directory
            frame = cv2.resize(frame, (frame_width, frame_height))
            second_frame = cv2.resize(second_frame, (frame_width, frame_height))
            combined_frame = cv2.addWeighted(frame, 0.5, second_frame, 0.5, 0)
            stframe = cv2.cvtColor(combined_frame, cv2.COLOR_BGR2RGB)
            placeholder.image(stframe, caption="Live Video Recording")

            frames.append(combined_frame)
            out.write(combined_frame)
            frame_filename = os.path.join(st.session_state.output_directory, f'frame_{frame_count}.png')
            cv2.imwrite(frame_filename, second_frame)
            frame_count += 1
            st.session_state.frame_count = frame_count

            if st.session_state.get("complete_recording", False):
                cap.release()
                out.release()
                ffmpeg_cmd.terminate()
                break
        except Exception as e:
            pass
st.title("Object Interaction and Voice Command Interface")
if "page" not in st.session_state:
st.session_state.page = 0
if 'frame_count' not in st.session_state:
st.session_state.frame_count = 0
if 'duration' not in st.session_state:
st.session_state.duration = 0
if 'fps' not in st.session_state:
st.session_state.fps = 0
if 'transcript' not in st.session_state:
st.session_state.transcript = 'none'
if 'obj_list' not in st.session_state:
st.session_state.obj_list = None
if 'obj_timestamp' not in st.session_state:
st.session_state.obj_timestamp = None
def nextpage(): st.session_state.page += 1
def restart(): st.session_state.page = 0
placeholder = st.empty()
if st.session_state.page == 0:
    placeholder.title("Select Connectivity Type")
    col1, col2 = st.columns(2)
    with col1:
        st.button("WiFi", key="wifi")
    with col2:
        st.button("Ethernet", key="ethernet")
elif st.session_state.page == 1:
    placeholder.title("Select Use Case")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.button("Defect Detection", key="defect_detection")
    with col2:
        st.button("Object Counting", key="obj_counting")
    with col3:
        st.button("Cycle Time Tracking", key="cyc_time_tracking")
elif st.session_state.page == 2:
    # Replace the placeholder with the instruction steps
    with placeholder.container():
        st.text("1. Point to the object")
        time.sleep(2)
        st.text("2. Speak while pointing and tell me what you want to do")
        time.sleep(2)
        st.text("3. Make sure to point your index finger at the object")
        time.sleep(2)
elif st.session_state.page == 3:
    st.title("Live Drawing")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Start Recording", key="start_recording"):
            st.text("Video recording started")
            placeholder = st.empty()
            st.session_state.complete_recording = False
            run_recording(placeholder)
    with col2:
        if st.button("Complete"):
            st.session_state.complete_recording = True
else:
    # Post-processing after recording completes
    progress_bar = st.progress(0, "Processing Video: 0%")
    progress_bar.progress(10, "Processing Video: 10%\t\tGetting Audio Duration")
    duration = librosa.get_duration(path=st.session_state.output_audio_file)
    progress_bar.progress(20, "Processing Video: 20%\t\tGetting FPS")
    fps = st.session_state.frame_count / duration
    progress_bar.progress(40, "Processing Video: 40%\t\tInitializing Class Detector")
    obj1 = STT_and_class_detector(st.session_state.output_video_file, "YOUR_OPENAI_API_KEY")  # define the class object
    progress_bar.progress(60, "Processing Video: 60%\t\tTranscribing the Audio")
    transcript = transcribe_audio(st.session_state.output_audio_file)
    progress_bar.progress(70, "Processing Video: 70%\t\tDetecting Class")
    obj1.class_detector_GPT('gpt-4', transcript[0])  # detect class objects using the GPT API
    progress_bar.progress(85, "Processing Video: 85%\t\tFinding Timestamps")
    obj1.find_timestamps(transcript[1])
    progress_bar.progress(100, "Processing Video: 100%\t\tMaking Class Objects Directories")
    obj1.object_directory(fps, st.session_state.output_directory)  # make class-object directories

    print(str(st.session_state.frame_count))
    print(str(duration))
    print(str(fps))
    print(str(transcript))
    print(obj1.objects_list)
    print(obj1.phrase_timestamps)

    st.text("Frame Count: " + str(st.session_state.frame_count))
    st.text("Duration: " + str(duration))
    st.text("FPS: " + str(fps))
    st.text("Transcript: " + str(transcript))
    st.text("Objects List: " + str(obj1.objects_list))
    st.text("Phrase Timestamps: " + str(obj1.phrase_timestamps))
button_center_css = """
<style>
.center-button {
    display: flex;
    justify-content: center;
}
</style>
"""
# Inject the CSS into the Streamlit app
st.markdown(button_center_css, unsafe_allow_html=True)
# Use the class to center the button
st.markdown('<div class="center-button">', unsafe_allow_html=True)
st.button("Next", on_click=nextpage, disabled=(st.session_state.page > 3), key="next")
st.markdown('</div>', unsafe_allow_html=True)
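
For reference, here are placeholder stubs for the helper_functions pieces the script relies on. These are not my real implementations; they only document the call signatures and return shapes that are visible from the code above (transcribe_audio is imported from helper_functions as well):

# Placeholder stubs only -- the real helpers in helper_functions.py are longer.
# They document what the main script above expects from each function.

def selected_region(points):
    # takes the list of interpolated (x, y) fingertip points and returns the
    # region that make_circle() operates on
    ...

def make_circle(frame, region):
    # returns None when no closed shape is detected; otherwise an indexable
    # result whose element [3] is a box that cv2.ellipse() can draw
    ...

def center_distance(circle_a, circle_b):
    # numeric distance between two detected circles' centres,
    # used to filter out near-duplicate circles
    ...

def transcribe_audio(audio_path):
    # returns an indexable result: [0] is passed to class_detector_GPT(),
    # [1] is passed to find_timestamps()
    ...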