Looking at Oyez Transcripts
Data versioning is an area of new development occurring under the industrial header of MLOps. MLOps is a new branch of the software industry dedicated to the elimination of development lifecycle anti-patterns that are specific to the creation and maintenance of machine learning pipelines. Easily one of the most impactful open source projects of recent decades is Git, created by Linus Torvalds et al. starting in 2005. There are two main capabilities that will be examined in this blog. First, we will examine the extent to which each technology can deliver reproducible data science — which is to say, that a project can be bundled up by a data scientist and passed to another person who can easily reproduce their results and build upon them. The second is the extent to which datasets may be safely altered, even for hypothetical (or counterfactual) representations, without worry of interfering with the master branch.
- Setting the data directories and choosing an example transcript to work with
- Get the necessary data into variables
- Planning for model architecture
- Plotting the waveform
- Example that verifies that the audio is loaded correctly.
- Example that verifies the slicer is working
- Single transcript segment example setup
- Force align all transcript segments in the example
- Force alignment experiments
import os
# Root of the local Oyez data layout: transcripts and full-length audio
# recordings live in sibling directories under data_root.
data_root = '/ws/data/voice-identity/supreme-court'
transcript_dirpath = os.path.join(data_root, 'transcripts')
audio_dirpath = os.path.join(data_root, 'full_audios')
# Sort for a deterministic ordering so "the first transcript" is stable
# across runs; skip hidden files such as .DS_Store.
ls = [f for f in sorted(os.listdir(transcript_dirpath)) if not f.startswith('.')]
# Choose an example transcript to work with.  The original line built this
# path as a bare, unused expression (a notebook display cell); binding it to
# a name keeps the value available in script form.
example_transcript_path = os.path.join(transcript_dirpath, ls[0])
# Project-local data bundle that pairs transcript sections with audio.
from prcvd.models.oyez import OyezAudioDataBunch
db = OyezAudioDataBunch()
# Load one oral argument from the local transcript/audio directories.
# NOTE(review): `url` presumably identifies which Oyez case to bundle —
# confirm against OyezAudioDataBunch.create.
db.create(
transcript_dirpath=transcript_dirpath,
audio_dirpath=audio_dirpath,
url='https://apps.oyez.org/player/#/roberts10/oral_argument_audio/13101'
)
# Peek at the first bundled section's raw transcript record (bare
# expression: notebook display cell).
db.bunch_sections[0].transcript_section.section
import matplotlib.pyplot as plt
import pandas as pd
from prcvd.talk_time_pipeline import (compute_talk_time, compute_prop_talk_time, compute_rolling_talk_timer,
nice_axes, compute_speaker_stats)
# Per-speaker aggregates over the whole argument (schema defined in
# prcvd.talk_time_pipeline, not visible here).
speaker_stats = compute_speaker_stats(databunch=db)
# Rolling talk time (rtt) and rolling proportion of talk time (rpt),
# seeded with the speakers discovered above.
rtt,rpt = compute_rolling_talk_timer(
sorted_transcript=db.transcript.transcript,
speakers_seed=list(speaker_stats.keys())
)
# Rolling proportion-of-talk-time as a time-indexed frame: one column per
# speaker, indexed by the 'ts' timestamp column.
df = pd.DataFrame(rpt)
df.index = df['ts']
del df['ts']
# Snapshot very near the end of the recording.
ts_perc = 0.999
# Clamp to the last valid position: round(len(df) * ts_perc) can equal
# len(df) itself (e.g. len(df)==500 gives round(499.5)==500 under
# half-to-even rounding), which made the original line raise IndexError.
ts = df.index.values[min(round(len(df) * ts_perc), len(df) - 1)]
fig, ax = plt.subplots(figsize=(4, 2.5), dpi=300)
colors = plt.cm.Dark2(range(6))
# Horizontal bar per speaker showing their share of talk time at `ts`.
s = df.loc[ts]
y = s.index
width = s.values
ax.barh(y=y, width=width, color=colors);
ax.set_title('recording second={} ({}% complete)'.format(ts, ts_perc*100), fontsize='smaller')
nice_axes(ax)
from random import randrange
from ncls import NCLS
class UnlabeledSection:
    """An audio span with no transcript label attached.

    Records the span both in seconds (``start_sec``/``close_sec``) and in
    sample frames (``start_tsf``/``close_tsf``), the latter derived from
    the audio's sample rate.
    """

    def __init__(self, start_sec, close_sec, audio_data):
        self.start_sec = start_sec
        self.close_sec = close_sec
        self.audio_data = audio_data
        rate = self.audio_data.sample_rate
        # Frame offsets corresponding to the second offsets above.
        self.start_tsf = self._tsf_from_sec(sec=self.start_sec, sample_rate=rate)
        self.close_tsf = self._tsf_from_sec(sec=self.close_sec, sample_rate=rate)

    def _tsf_from_sec(self, sec, sample_rate):
        """Convert a time in seconds to a (possibly fractional) frame count."""
        return sec * sample_rate
def get_start_stop(start_sec, close_sec, sample_rate):
    """
    Discretizes the float (second time) index to sample index.

    Returns a ``(first, last)`` pair of integer sample indices.
    """
    def to_sample_index(sec):
        # Truncation toward zero, matching int() semantics.
        return int(sec * sample_rate)

    # +1 so the span begins on the sample just after the previous
    # segment's close sample.
    return to_sample_index(start_sec) + 1, to_sample_index(close_sec)
def sample_from_waveform(audio_data, sample_len):
    """Draw one random, fixed-length unlabeled section from the waveform.

    Args:
        audio_data: object exposing ``waveform`` (channels x samples),
            ``sample_rate``, and ``slice_waveform(start_sec, close_sec)``.
        sample_len: length of the section to draw, in seconds.

    Returns:
        An ``UnlabeledSection`` wrapping the sliced audio.

    Bug fix: the original drew ``close_sec`` uniformly from
    ``[0, total - sample_len)`` and then set
    ``start_sec = close_sec - sample_len``, which yields a NEGATIVE start
    whenever ``close_sec < sample_len``.  Drawing the start first keeps the
    whole window inside the recording.
    """
    total_sec = round(audio_data.waveform.shape[1] / audio_data.sample_rate)
    # Pick a start so that [start_sec, start_sec + sample_len] fits.
    start_sec = randrange(round(total_sec - sample_len))
    close_sec = start_sec + sample_len
    return UnlabeledSection(
        start_sec=start_sec,
        close_sec=close_sec,
        audio_data=audio_data.slice_waveform(start_sec=start_sec, close_sec=close_sec)
    )
def get_ncls_format(ordered_transcript, sample_rate):
    """Flatten an ordered transcript into NCLS-ready parallel arrays.

    Returns ``(time_indexed_sections, starts, ends, index)`` where the
    three lists hold sample-index intervals and the dict maps each
    interval's close sample back to its transcript section.  The interval
    id reused for NCLS is the close-sample index.
    """
    sections_by_close = {}
    interval_starts = []
    interval_ends = []
    interval_ids = []
    for sec in ordered_transcript:
        # Same discretization as get_start_stop, inlined: first sample
        # after the previous close, inclusive close sample.
        first = int(sec.section['start_time'] * sample_rate) + 1
        last = int(sec.section['stop_time'] * sample_rate)
        interval_starts.append(first)
        interval_ends.append(last)
        interval_ids.append(last)
        sections_by_close[last] = sec
    return sections_by_close, interval_starts, interval_ends, interval_ids
# Build interval-tree inputs from the real transcript + audio pair.
time_indexed_sections, starts, ends, index = get_ncls_format(
ordered_transcript=db.transcript.transcript, sample_rate=db.audio_data.sample_rate
)
import numpy as np
# NCLS (nested containment list) gives fast "which transcript section
# covers sample t?" queries over the [start, end] sample intervals; ids
# are the close-sample indices used as keys in time_indexed_sections.
ints = NCLS(starts=np.array(starts), ends=np.array(ends), ids=np.array(index))
# Smoke test: pull one random 10-second unlabeled section from the audio.
s1 = sample_from_waveform(audio_data=db.audio_data, sample_len=10.0)
"We show that artificially augmenting the training data with noises and reverberation is a highly effective strategy for improving performance in DNN embedding systems." Action: create functions that allow for waveform noise and reverb (what kind of noise? what kind of reverb?)
import IPython
# Skip hidden files (e.g. .DS_Store), which are not playable audio.
audios = [fp for fp in os.listdir(audio_dirpath) if fp[0] != '.']
# Bug fix: the original passed `rate=sample_rate`, but no bare
# `sample_rate` name is defined at module level (only
# db.audio_data.sample_rate), so that line raised NameError.
IPython.display.Audio(os.path.join(audio_dirpath, audios[0]),
                      rate=db.audio_data.sample_rate)