Discourse tabulation and summary module

Many thanks to Nora Gulick for developing a Python module for tabulating and summarizing codes assigned to turns in conversations. This code can be used to generate summaries of how many times each speaker in a multi-party conversation produced each kind of move. It produces a handy-dandy spreadsheet featuring this data.

When referring to this module in presentations and publications, please use the following citation:

Gulick, E. (2024). Python code for discourse structure analysis synoptic summary. The USeFuL Project. https://useful-discourse.info/sfl-resources/discourse-tabulation-and-summary-module/

Please see below for the code:

"""

Created on Sat Oct  9 09:31:36 2021

@author: Eleanor Gulick

"""

#data extraction coarse discourse structure analysis synoptic summary

import numpy as np

import pandas

import nltk

import matplotlib.pyplot as plt

# Base name of the coded-transcript workbook, WITHOUT the ".xlsx" extension.
# Replace the placeholder with the name of your own file.
filename = "Filename"

# Read the workbook (the extension is added here) and convert it to a NumPy
# array so single cells can be read and corrected by [row, column] index.
# The rest of the script treats column 0 as the time, column 1 as the
# speaker, column 2 as the utterance and column 3 as the move type.
data = pandas.read_excel(filename + ".xlsx", engine='openpyxl')
data_arr = np.array(data)

# Unique speaker names exactly as typed in column 1, collected so they can be
# inspected for spelling mistakes.
names = list(set(data_arr[:, 1]))

# Unique move-type labels exactly as typed in column 3, collected for the same
# kind of inspection. (Note the underscore: variable names can never contain
# spaces.)
move_type = list(set(data_arr[:, 3]))

# The only move-type labels the analysis accepts. The order matters: it is the
# order in which labels are tried during correction and the column order of
# the move-count table.
valid_move_types = ['0', 'Open', 'Continue', 'R:Respond', 'R:Rejoinder', 'Repair']

# Correct common move type errors: any cell in column 3 whose text CONTAINS a
# valid label (for example a label with stray spaces or extra characters
# around it) is replaced by the clean label. Labels are tried in list order
# and the first one found wins.
for row in range(len(data_arr[:, 3])):
    # str() so that numeric or blank cells can be searched like text.
    cell_text = str(data_arr[row, 3])
    for label in valid_move_types:
        if label in cell_text:
            data_arr[row, 3] = label
            break

# Write the corrected table (first four columns only) back over the input
# workbook so the fixes are kept.
df = pandas.DataFrame(
    data=data_arr[:, 0:4],
    columns=["Time", "Speaker", "Utterance", "Move Type"],
)
df.to_excel(filename + '.xlsx', index=False)

# Find cells with data-entry errors: list every row whose move type is not
# exactly one of the valid labels, so it can be fixed by hand in the workbook.
print('Invalid move type rows:')
for row, cell in enumerate(data_arr[:, 3]):
    if str(cell) not in valid_move_types:
        # +2 turns the 0-based array index into the spreadsheet row number,
        # assuming a single header row (header on row 1, data from row 2).
        print(row + 2)

# The speaker names the analysis accepts. Replace the placeholders with the
# names of the people in your conversation.
valid_names = ['Name1', 'Name2', 'Name3']

# Correct common errors in names: any cell in column 1 whose text CONTAINS a
# valid name is replaced by the clean name. Names are tried in list order and
# the first one found wins.
for row in range(len(data_arr[:, 1])):
    # str() so that a blank (NaN) or numeric speaker cell is searched as text
    # instead of raising a TypeError on the "in" test.
    cell_text = str(data_arr[row, 1])
    for speaker in valid_names:
        if speaker in cell_text:
            data_arr[row, 1] = speaker
            break

# Write the corrected table (first four columns only) back over the input
# workbook so the fixes are kept.
df = pandas.DataFrame(
    data=data_arr[:, 0:4],
    columns=["Time", "Speaker", "Utterance", "Move Type"],
)
df.to_excel(filename + '.xlsx', index=False)

# List every row whose speaker is not exactly one of the valid names, so it
# can be fixed by hand in the workbook.
print('Invalid name rows:')
for row, cell in enumerate(data_arr[:, 1]):
    if str(cell) not in valid_names:
        # +2 turns the 0-based array index into the spreadsheet row number,
        # assuming a single header row (header on row 1, data from row 2).
        print(row + 2)

# Create a data structure and extract data.
# data_struct holds one dictionary per valid speaker, in valid_names order:
#   {'name': <speaker>,
#    <move type>: {'times': [...], 'row_num': [...]},   # one per valid move type
#    ...}
data_struct = []
for speaker in valid_names:
    entry = {'name': speaker}
    for label in valid_move_types:
        entry[label] = {'times': [], 'row_num': []}
    data_struct.append(entry)

# File every transcript row under its speaker and move type. The loop uses its
# own variable names so it does not overwrite the move_type inspection list.
for row in range(len(data_arr[:, 1])):
    row_time = str(data_arr[row, 0])
    row_speaker = str(data_arr[row, 1])
    row_move = str(data_arr[row, 3])

    # Rows with an unrecognized move type are not tallied.
    if row_move not in valid_move_types:
        continue

    for entry in data_struct:
        if entry['name'] == row_speaker:
            # +2 turns the 0-based array index into the spreadsheet row
            # number, assuming a single header row.
            entry[row_move]['row_num'].append(row + 2)
            entry[row_move]['times'].append(row_time)

# Total move counts: one row per speaker (valid_names order) and one column
# per move type (valid_move_types order). Each cell is the number of
# transcript rows recorded for that speaker and move type.
move_counts = np.zeros((len(valid_names), len(valid_move_types)))
for i, entry in enumerate(data_struct):
    for j, label in enumerate(valid_move_types):
        move_counts[i, j] = len(entry[label]['row_num'])

print('\nMove counts:')
print(move_counts)

# Convert names to a NumPy array and concatenate with the move counts array.
# Stacking text next to numbers turns every cell into text (a count of 3
# becomes '3.0'), so this combined array is kept only for inspection and
# backward compatibility; the spreadsheet below is built from the numeric
# counts instead.
export_move_counts = np.column_stack((np.array(valid_names), move_counts))

# Build the summary table. The column headers come from valid_move_types so
# there is always exactly one header per count column (a hard-coded header
# list with an extra entry makes pandas raise a ValueError), and the counts
# are stored as whole numbers so they can be summed in the spreadsheet.
df = pandas.DataFrame(data=move_counts.astype(int), columns=list(valid_move_types))
df.insert(0, 'Name', valid_names)

# Write the data frame to a new Excel file.
# NOTE: choose a name that differs from the input workbook. With the
# placeholders left unchanged this is the same file as the input, and the
# summary would overwrite the transcript.
filepath = 'Filename.xlsx'
df.to_excel(filepath, index=False)