Setting the data directories and choosing an example transcript to work with

import os
data_root = '/ws/data/voice-identity/supreme-court'
transcript_dirpath = os.path.join(data_root, 'transcripts')
audio_dirpath = os.path.join(data_root, 'full_audios')
ls = [f for f in sorted(os.listdir(transcript_dirpath)) if f[0] != '.']
os.path.join(transcript_dirpath, ls[0])
'/ws/data/voice-identity/supreme-court/transcripts/12989_transcript.pickle'
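
The cells that loaded the chosen transcript and its audio into the transcript, waveform, and sample_rate variables used later in this notebook are not shown above; a minimal sketch of that step, assuming the pickles were written with the standard library pickle module and that the audio files are readable with torchaudio:

import pickle
import torchaudio  # assumption: the full_audios files are in a torchaudio-readable format

# Load the chosen transcript pickle into the `transcript` dict used below.
with open(os.path.join(transcript_dirpath, ls[0]), 'rb') as f:
    transcript = pickle.load(f)

# Load the matching full-argument audio into `waveform` / `sample_rate`.
# Assumption: the sorted audio filenames line up with the sorted transcript filenames.
audio_fps = sorted(fp for fp in os.listdir(audio_dirpath) if not fp.startswith('.'))
waveform, sample_rate = torchaudio.load(os.path.join(audio_dirpath, audio_fps[0]))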

Getting the necessary data into variables

from prcvd.models.oyez import OyezAudioDataBunch

db = OyezAudioDataBunch()
db.create(
    transcript_dirpath=transcript_dirpath,
    audio_dirpath=audio_dirpath,
    url='https://apps.oyez.org/player/#/roberts10/oral_argument_audio/13101'    
)

db.bunch_sections[0].transcript_section.section
{'raw': <p class="ng-binding ng-scope ng-isolate-scope" ng-class="{'active': sync.synced, 'search-result': player.isSearchResult(textBlock), 'clipping-active': player.drawer == 'clip', 'clipped': player.isClipped(textBlock)}" ng-click="player.onTextClick(textBlock)" ng-repeat="textBlock in turn.text_blocks" oyez-scroll-to="sync.synced || player.isCurrentSearchResult(textBlock)" oyez-sync="" start-time="0" stop-time="13.064" sync-queue="textBlocks"> Number 523, Joan Greenway Collins et al., versus American Buslines Incorporated, Respondent Employer. </p>,
 'case_name': 'Collins v. American Buslines, Inc.',
 'conv_date': datetime.datetime(1956, 3, 29, 0, 0),
 'speaker': 'Earl Warren',
 'start_time': 0.0,
 'stop_time': 13.064,
 'transcript': ' Number 523, Joan Greenway Collins et al., versus American Buslines Incorporated, Respondent Employer. '}
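
Before computing talk time it helps to see which speakers appear in the databunch; a small sketch over the same bunch_sections structure shown above (assuming it behaves like a plain list of sections):

from collections import Counter

# Count how many transcript sections each speaker has in this argument.
Counter(
    s.transcript_section.section['speaker'] for s in db.bunch_sections
).most_common()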

import matplotlib.pyplot as plt
import pandas as pd

from prcvd.talk_time_pipeline import (
    compute_talk_time, compute_prop_talk_time, compute_rolling_talk_timer,
    nice_axes, compute_speaker_stats
)

speaker_stats = compute_speaker_stats(databunch=db)
rtt, rpt = compute_rolling_talk_timer(
    sorted_transcript=db.transcript.transcript,
    speakers_seed=list(speaker_stats.keys())
)

df = pd.DataFrame(rpt).set_index('ts')

ts_perc = 0.999
ts = df.index.values[min(round(len(df) * ts_perc), len(df) - 1)]  # clamp to a valid index
fig, ax = plt.subplots(figsize=(4, 2.5), dpi=300)
colors = plt.cm.Dark2(range(6))
s = df.loc[ts]
y = s.index
width = s.values
ax.barh(y=y, width=width, color=colors);
ax.set_title('recording second={} ({}% complete)'.format(ts, ts_perc*100), fontsize='smaller')

nice_axes(ax)

from random import randrange
from ncls import NCLS

class UnlabeledSection:
    """An audio slice with no transcript label, bounded by start/close times in seconds."""
    def __init__(self, start_sec, close_sec, audio_data):
        self.start_sec = start_sec
        self.close_sec = close_sec
        self.audio_data = audio_data
        # Sample-frame indices corresponding to the second boundaries.
        self.start_tsf = self._tsf_from_sec(
            sec=self.start_sec, sample_rate=self.audio_data.sample_rate
        )
        self.close_tsf = self._tsf_from_sec(
            sec=self.close_sec, sample_rate=self.audio_data.sample_rate
        )

    def _tsf_from_sec(self, sec, sample_rate):
        return sec * sample_rate
    
def get_start_stop(start_sec, close_sec, sample_rate):
    """
    Discretizes a (start_sec, close_sec) interval in seconds into sample indices.
    """
    ss = int(start_sec * sample_rate) + 1
    cs = int(close_sec * sample_rate)
    return ss, cs
    
    
def sample_from_waveform(audio_data, sample_len):
    """Draws a random sample_len-second slice from the full waveform."""
    duration_sec = audio_data.waveform.shape[1] / audio_data.sample_rate
    start_sec = randrange(round(duration_sec - sample_len))
    close_sec = start_sec + sample_len
    return UnlabeledSection(
        start_sec=start_sec,
        close_sec=close_sec,
        audio_data=audio_data.slice_waveform(start_sec=start_sec, close_sec=close_sec)
    )
    

def get_ncls_format(ordered_transcript, sample_rate):
    """Converts the ordered transcript into the parallel (start, end, id) arrays
    that NCLS expects, keyed by each section's closing sample index."""
    time_indexed_sections = {}
    starts = []
    ends = []
    index = []
    for section in ordered_transcript:
        ss, cs = get_start_stop(
            start_sec=section.section['start_time'],
            close_sec=section.section['stop_time'],
            sample_rate=sample_rate
        )
        starts.append(ss)
        ends.append(cs)
        index.append(cs)
        time_indexed_sections[cs] = section
        
    return time_indexed_sections, starts, ends, index

time_indexed_sections, starts, ends, index = get_ncls_format(
    ordered_transcript=db.transcript.transcript, sample_rate=db.audio_data.sample_rate
)

import numpy as np
ints = NCLS(starts=np.array(starts), ends=np.array(ends), ids=np.array(index))
s1 = sample_from_waveform(audio_data=db.audio_data, sample_len=10.0)
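
The interval index makes it cheap to recover which labeled transcript sections a random unlabeled slice overlaps. A minimal sketch, assuming NCLS.find_overlap from the ncls package yields (start, end, id) tuples in the same sample units used to build the index:

# Find every transcript section that overlaps the sampled 10-second slice.
for start, end, section_id in ints.find_overlap(int(s1.start_tsf), int(s1.close_tsf)):
    section = time_indexed_sections[section_id]
    print(section.section['speaker'],
          section.section['start_time'],
          section.section['stop_time'])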

Planning for model architecture

"We show that artificially augmenting the training data with noises and reverberation is a highly effective strategy for improving performance in DNN embedding systems." Action: create functions that allow for waveform noise and reverb (what kind of noise? what kind of reverb?)

Plotting the waveform

Example that verifies that the audio is loaded correctly.

import IPython

audios = [fp for fp in os.listdir(audio_dirpath) if fp[0] != '.']
IPython.display.Audio(os.path.join(audio_dirpath, audios[0]), 
                      rate=sample_rate)

Example that verifies the slicer is working

slc = slice_waveform(
    waveform=waveform,
    start_sec=0,
    close_sec=10,
    sample_rate=sample_rate
)

IPython.display.Audio(slc, rate=sample_rate)
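
slice_waveform itself is defined elsewhere in the project; a minimal sketch of the behavior assumed above, for a torch.Tensor waveform of shape (channels, samples):

def slice_waveform(waveform, start_sec, close_sec, sample_rate):
    """Returns the samples between start_sec and close_sec."""
    start, close = get_start_stop(
        start_sec=start_sec, close_sec=close_sec, sample_rate=sample_rate
    )
    return waveform[:, start:close]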

def get_audio_from_transcript_section(transcript_section, waveform, sample_rate):
    """
    Helper function for slicing based on the oyez transcript section format.
    """
    return slice_waveform(
        waveform=waveform,
        start_sec=float(transcript_section['start_time']),
        close_sec=float(transcript_section['stop_time']),
        sample_rate=sample_rate
    )

Single transcript segment example setup

idx = 1
k1 = list(transcript['transcript'].keys())[idx]
t1 = transcript['transcript'][k1]
audio_section = get_audio_from_transcript_section(
    transcript_section=t1,
    waveform=waveform,
    sample_rate=sample_rate
)
print(t1['transcript'])
IPython.display.Audio(audio_section, rate=sample_rate)
 Mastro, at first, had the corporation of its employees, who would do there membership from their presently designated representative, the carpenters, and joined 318. 

t1
{'raw': <p class="ng-binding ng-scope ng-isolate-scope" ng-class="{'active': sync.synced, 'search-result': player.isSearchResult(textBlock), 'clipping-active': player.drawer == 'clip', 'clipped': player.isClipped(textBlock)}" ng-click="player.onTextClick(textBlock)" ng-repeat="textBlock in turn.text_blocks" oyez-scroll-to="sync.synced || player.isCurrentSearchResult(textBlock)" oyez-sync="" start-time="6.014" stop-time="17.696" sync-queue="textBlocks"> Mastro, at first, had the corporation of its employees, who would do there membership from their presently designated representative, the carpenters, and joined 318. </p>,
 'case_name': 'Mastro Plastics Corporation v. National Labor Relations Board',
 'conv_type': 'Opinion Announcement',
 'conv_date': 'February 27, 1956',
 'speaker': 'Mr. Attorney General',
 'start_time': '6.014',
 'stop_time': '17.696',
 'transcript': ' Mastro, at first, had the corporation of its employees, who would do there membership from their presently designated representative, the carpenters, and joined 318. '}

import subprocess
def force_align(tempdir, outdir, section_id, transcript_text, 
                audio_section, sample_rate):
    """
    Force aligns transcript_text with audio waveform (audio_section).
    
    Requires: lowerquality/gentle following installation procedures.
    
    In:
        tempdir: str, path that runtime has access to for temp storage.
        outdir: str, path that the force-aligned output will be written
        section_id: str, {case_name}_{conv_type}_{speaker}_{start_time}_{stop_time}
        transcript_text: str, transcription to align
        audio_section: torch.Tensor, audio waveform that contains transcript_text
        sample_rate: int, sample rate of audio_section in Hz
    
    Note: the process writes two temp files (one .wav and one .txt) and removes
        them after alignment.
    
    TODO: add log lines
    """
    wav_fp = os.path.join(tempdir, 'temp_'+section_id+'.wav')
    txt_fp = os.path.join(tempdir, 'temp_'+section_id+'.txt')
    out_fp = os.path.join(outdir, section_id+'.json')
    if os.path.exists(out_fp): return
    
    write_wav(
        fp=wav_fp, 
        sample_rate=sample_rate, 
        audio_section=audio_section
    )
    write_transcript_section(
        fp=txt_fp,
        transcript_text=transcript_text
    )
    cmd = ['python3', 
           '/Users/free-soellingeraj/code/gentle/align.py',
           wav_fp, txt_fp, '-o', out_fp]
    var = subprocess.call(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    )
    os.remove(wav_fp)
    os.remove(txt_fp)
    return var
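
write_wav and write_transcript_section are small helpers not shown above; minimal sketches of what force_align assumes, using torchaudio.save for the audio (an assumption about how the temp .wav is written):

import torchaudio  # assumption: audio_section is a torch.Tensor of shape (channels, samples)

def write_wav(fp, sample_rate, audio_section):
    """Writes the audio slice to a temporary .wav file for gentle to read."""
    torchaudio.save(fp, audio_section, sample_rate)

def write_transcript_section(fp, transcript_text):
    """Writes the transcript text to a temporary .txt file for gentle to read."""
    with open(fp, 'w') as f:
        f.write(transcript_text)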

Force align all transcript segments in the example

%%time
case_name = db.transcript.transcript[0].section['case_name'].replace(' ', '').replace('v.', 'vs')
outdir = os.path.join(os.getcwd(), case_name)
os.makedirs(outdir, exist_ok=True)
tempdir = os.path.join(os.getcwd(), 'align_temp')
os.makedirs(tempdir, exist_ok=True)

section_id_fmt = '{case_name}_{conv_type}_{speaker}_{start_time}_{stop_time}'
for transcript_section in transcript['transcript'].values():
    section_id = section_id_fmt.format(
        case_name=case_name,
        conv_type=transcript_section['conv_type'],
        speaker=transcript_section['speaker'],
        start_time=transcript_section['start_time'],
        stop_time=transcript_section['stop_time']
    )
    force_align(
        tempdir=tempdir,
        outdir=outdir,
        section_id=section_id,
        transcript_text=transcript_section['transcript'],
        audio_section=get_audio_from_transcript_section(
            transcript_section=transcript_section,
            waveform=waveform,
            sample_rate=sample_rate
        ),
        sample_rate=sample_rate
    )
CPU times: user 664 ms, sys: 1.98 s, total: 2.65 s
Wall time: 43min 53s

Force alignment experiments

Force alignment uses its own pre-trained models to align text to the waveform, and like any models they have inaccuracies that need to be understood. Questions:
1) How can we identify from the force-alignment output when the text transcript is not entirely contained in the audio? (See the sketch below.)
2) How can we identify from the force-alignment output when the audio contains more than just the provided text transcript?
3) How well do the alignments work for random single-syllable words? Two-syllable, three, ...?
4) How well do the alignments work for the underlying phones? Two phones, three phones, etc.?
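
For question 1, one starting point is to inspect the per-word case field in gentle's JSON output and flag sections where many words were not aligned (treating anything other than 'success' as unaligned is an assumption about gentle's output schema):

import json

def unaligned_word_fraction(aligned_json_fp):
    """Fraction of transcript words gentle could not align to the audio."""
    with open(aligned_json_fp) as f:
        aligned = json.load(f)
    words = aligned.get('words', [])
    if not words:
        return None
    missing = sum(1 for w in words if w.get('case') != 'success')
    return missing / len(words)

# Sections with a high unaligned fraction are candidates for
# "transcript text not actually present in the audio".
fractions = {
    fp: unaligned_word_fraction(os.path.join(outdir, fp))
    for fp in os.listdir(outdir) if fp.endswith('.json')
}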