Many thanks to Nora Gulick for developing a Python module for tabulating and summarizing codes assigned to turns in conversations. This code can be used to generate summaries of how many times each speaker in a multi-party conversation produced each kind of move. It produces a handy-dandy spreadsheet featuring this data.
When referring to this module in presentations and publications, please use the following citation:
Gulick, E. (2024). Python code for discourse structure analysis synoptic summary. The USeFuL Project. https://useful-discourse.info/sfl-resources/discourse-tabulation-and-summary-module/
Please see below for the code:
"""
Created on Sat Oct 9 09:31:36 2021
@author: Eleanor Gulick
"""
#data extraction coarse discourse structure analysis synoptic summary
import numpy as np
import pandas
import nltk
import matplotlib.pyplot as plt
# Name of the input workbook, WITHOUT the ".xlsx" extension.
# Replace the placeholder with the name of the coded transcript.
filename = "Filename"

# Read the workbook (the extension is appended here) and convert it to a
# 2-D NumPy array. The rest of the script addresses columns by position:
# 0 = Time, 1 = Speaker, 2 = Utterance, 3 = Move Type.
data = pandas.read_excel(filename + ".xlsx", engine='openpyxl')
data_arr = np.array(data)
# Distinct speaker names found in the sheet, kept for eyeballing typos.
names = list(set(data_arr[:, 1]))

# Distinct move-type labels found in the sheet, kept for eyeballing typos.
move_type = list(set(data_arr[:, 3]))

# The only labels accepted in the "Move Type" column.
valid_move_types = ['0', 'Open', 'Continue', 'R:Respond', 'R:Rejoinder', 'Repair']

# Correct common move-type errors: if a cell merely CONTAINS a valid label
# (extra spaces, trailing punctuation, ...), replace it with the clean label.
# Labels are tried in list order and the first hit wins, so any cell whose
# text contains the character "0" is relabelled '0'.
for i in range(len(data_arr)):
    cell_text = str(data_arr[i, 3])
    for label in valid_move_types:
        if label in cell_text:
            data_arr[i, 3] = label
            break

# Write the cleaned table back over the input workbook.
df = pandas.DataFrame(
    data=data_arr[:, 0:4],
    columns=["Time", "Speaker", "Utterance", "Move Type"],
)
df.to_excel(filename + '.xlsx', index=False)
# Report rows whose move type is still not one of the valid labels after
# the automatic correction, so they can be fixed by hand in the workbook.
print('Invalid move type rows:')
for i in range(len(data_arr)):
    if str(data_arr[i, 3]) not in valid_move_types:
        # +2 turns the 0-based array index into the spreadsheet row number
        # (rows are 1-based and row 1 holds the column headers).
        print(i + 2)
# The only speaker names accepted in the "Speaker" column.
# Replace the placeholders with the participants of the conversation.
valid_names = ['Name1', 'Name2', 'Name3']

# Correct common errors in names: if a cell merely CONTAINS a valid name,
# replace it with the clean name. Names are tried in list order and the
# first hit wins.
for i in range(len(data_arr)):
    # str() guards against non-text cells (e.g. an empty cell read as NaN),
    # on which a bare substring test would raise TypeError.
    cell_text = str(data_arr[i, 1])
    for speaker in valid_names:
        if speaker in cell_text:
            data_arr[i, 1] = speaker
            break

# Write the cleaned table back over the input workbook.
df = pandas.DataFrame(
    data=data_arr[:, 0:4],
    columns=["Time", "Speaker", "Utterance", "Move Type"],
)
df.to_excel(filename + '.xlsx', index=False)
# Report rows whose speaker is still not one of the valid names after the
# automatic correction, so they can be fixed by hand in the workbook.
print('Invalid name rows:')
for i in range(len(data_arr)):
    if str(data_arr[i, 1]) not in valid_names:
        # +2 turns the 0-based array index into the spreadsheet row number
        # (rows are 1-based and row 1 holds the column headers).
        print(i + 2)
# Create a data structure and extract data.
# data_struct holds one dict per valid speaker:
#   {'name': <speaker>, <move type>: {'times': [...], 'row_num': [...]}, ...}
# with one (initially empty) entry per valid move type.
data_struct = [
    {
        'name': speaker,
        **{label: {'times': [], 'row_num': []} for label in valid_move_types},
    }
    for speaker in valid_names
]

# Walk the sheet once and file every turn under its speaker and move type.
# Rows with an unknown speaker or an unknown move type are skipped; they
# are the rows listed by the "Invalid ..." reports.
for i in range(len(data_arr)):
    time = str(data_arr[i, 0])
    name = str(data_arr[i, 1])
    move_type = str(data_arr[i, 3])
    if move_type not in valid_move_types:
        continue
    for entry in data_struct:
        if entry['name'] == name:
            # i + 2 is the spreadsheet row (1-based, after the header row).
            entry[move_type]['row_num'].append(i + 2)
            entry[move_type]['times'].append(time)
# Total move counts: one row per valid speaker, one column per valid move
# type, in the order of valid_names / valid_move_types.
move_counts = np.zeros((len(valid_names), len(valid_move_types)))
for i, entry in enumerate(data_struct):
    for j, label in enumerate(valid_move_types):
        # Every recorded turn contributed exactly one row number.
        move_counts[i, j] = len(entry[label]['row_num'])

print('\nMove counts:')
print(move_counts)
# Put the speaker names in front of the count columns. The names are stored
# as an object array and the counts as integers so that the stacked array
# keeps real numbers; stacking a string array with a float array would turn
# every count into text such as "3.0".
export_move_counts = np.column_stack(
    (np.array(valid_names, dtype=object), move_counts.astype(int))
)

# Convert back to a data frame. The header is derived from valid_move_types
# so it always has exactly one label per count column.
df = pandas.DataFrame(
    data=export_move_counts,
    columns=['Name'] + list(valid_move_types),
)

# Write the summary to its own Excel file.
# NOTE: choose a name different from the input workbook (filename + '.xlsx'),
# otherwise the coded transcript is overwritten by the summary.
filepath = 'Filename.xlsx'
df.to_excel(filepath, index=False)