import os.path import math def clean_up(s): """ Return a version of string str in which all letters have been converted to lowercase and punctuation characters have been stripped from both ends. Inner...

1 answer below »
Continue writing and test an existing programme using the instructions included in the attachments. Additional files in email!


import os.path import math def clean_up(s): """ Return a version of string str in which all letters have been converted to lowercase and punctuation characters have been stripped from both ends. Inner punctuation is left untouched. """ punctuation = '''!"',;:.-?)([]<>*#\n\t\r''' result = s.lower().strip(punctuation) return result def average_word_length(line_list): ''' Return the average length of all words in line_list. Do not include surrounding punctuation in words. text is a non-empty list of strings each ending in \n. At least one line in text contains a word.''' # To do: Replace this function's body to meet its specification. return 1.0 def unique_words_ratio(line_list): ''' Return the type token ratio (TTR) for this line_list. TTR is the number of different words divided by the total number of words. text is a non-empty list of strings each ending in \n. At least one line in text contains a word. ''' # To do: Replace this function's body to meet its specification. return 1.0 def hapax_legomana_ratio(line_list): ''' Return the hapax_legomana ratio for this text. This ratio is the number of words that occur exactly once divided by the total number of words. text is a list of strings each ending in \n. At least one line in text contains a word.''' # To do: Replace this function's body to meet its specification. return 1.0 def split_text(original, separators): '''Return a list of non-empty, non-blank strings from the original string determined by splitting the string on any of the separators. separators is a string of single-character separators.''' # To do: Complete this function's body to meet its specification. result = [] return result def average_sentence_length(text_from_file): ''' Return the average number of words per sentence in text. text_from_file is guaranteed to have at least one sentence. Sentence terminating punctuation defined as !?. 
A sentence is defined as a non-empty string of non-terminating punctuation surrounded by terminating punctuation or beginning or end of file. ''' # To do: Replace this function's body to meet its specification. return 1.0 def average_sentence_complexity(text_from_file): '''Return the average number of phrases per sentence. Terminating punctuation defined as !?. A sentence is defined as a non-empty string of non-terminating punctuation surrounded by terminating punctuation or beginning or end of file. Phrases are substrings of a sentences separated by one or more of the following delimiters ,;: ''' # To do: Replace this function's body to meet its specification. return 1.0 def get_valid_filename(prompt): '''Use prompt (a string) to ask the user to type the name of a file. If the file does not exist, keep asking until they give a valid filename. Return the name of that file.''' # To do: Complete this function's body to meet its specification. filename = input(prompt) return filename # Uncomment and use this statement as many times as needed for input: # filename = input(prompt) # Uncomment and use this statement as many times as needed for output: # print "That file does not exist." # Do not use any other input or output statements in this function. def read_directory_name(prompt): '''Use prompt (a string) to ask the user to type the name of a directory. If the directory does not exist, keep asking until they give a valid directory. ''' # To do: Complete this function's body to meet its specification. dirname = input(prompt) return dirname # Uncomment and use this statement as many times as needed for input: # dirname = input(prompt) # Uncomment and use this statement as many times as needed for output: # print("That directory does not exist.") # Do not use any other input or output statements in this function. def compare_signatures(sig1, sig2, weight): '''Return a non-negative real number indicating the similarity of two linguistic signatures. 
The smaller the number the more similar the signatures. Zero indicates identical signatures. sig1 and sig2 are 6 element lists with the following elements 0 : author name (a string) 1 : average word length (float) 2 : TTR (float) 3 : Hapax Legomana Ratio (float) 4 : average sentence length (float) 5 : average sentence complexity (float) weight is a list of multiplicative weights to apply to each linguistic feature. weight[0] is ignored. ''' # To do: Replace this function's body to meet its specification. return 0.0 def read_signature(filename): '''Read a linguistic signature from filename and return it as list of features. ''' file = open(filename, 'r') # the first feature is a string so it doesn't need casting to float result = [file.readline()] # all remaining features are real numbers for line in file: result.append(float(line.strip())) return result if __name__ == '__main__': prompt = 'enter the name of the file with unknown author: ' mystery_filename = get_valid_filename(prompt) # readlines gives us a list of strings one for each line of the file text = open(mystery_filename, 'r').readlines() # calculate the signature for the mystery file mystery_signature = [mystery_filename] mystery_signature.append(average_word_length(text)) mystery_signature.append(unique_words_ratio(text)) mystery_signature.append(hapax_legomana_ratio(text)) mystery_signature.append(average_sentence_length(text)) mystery_signature.append(average_sentence_complexity(text)) weights = [0, 11, 33, 50, 0.4, 4] prompt = 'enter the path to the directory of signature files: ' dir = read_directory_name(prompt) # every file in this directory must be a linguistic signature files = os.listdir(dir) # we will assume that there is at least one signature in that directory this_file = files[0] signature = read_signature('%s/%s'%(dir,this_file)) best_score = compare_signatures(mystery_signature, signature, weights) best_author = signature[0] for this_file in files[1:]: signature = 
read_signature('%s/%s'%(dir, this_file)) score = compare_signatures(mystery_signature, signature, weights) if score < best_score: best_score = score best_author = signature[0] print("best author match: %s with score %s"%(best_author, best_score)) complete and test (using module test_detect.py) an authorship detection programme using the outlined process. the task. you are given a program, detect.py, which will take a text file as input and calculate the score for certain patterns in the file. these scores can be compared with the previously calculated scores for texts written by known authors. you have a set of files containing mystery texts and files that contain signatures (author names) for the mystery texts. you will need to write the below functions that calculate a score for a particular literary feature of the text. linguistic features description function name average word length the average number of characters per word, calculated after the punctuation has been stripped using the clean_up function (rounded to 2 decimal places) average_word_length() type token ratio the number of different words used in a text divided by the total number of words (measures how repetitive the vocabulary is). use the provided clean_up function so "this","this","this," and "(this" are not counted as different words unique_words_ratio() hapax legomena ratio the number of words occurring exactly once in the text divided by the total number of words hapax_legomena_ratio() average number of words per sentence calculate the mean value for all sentences in the text average_words_in_sentence() average number of phrases per sentence average number of phrases per sentence. find the phrases by taking each sentence, as defined above, and splitting it on any of colon, semi-colon or comma. average_sentence_complexity() definitions: · token = string that you get from calling the string method split on a line of the file. 
· word = non-empty token from the file that isn't completely made up of punctuation. find the words in a file by using str.split to find the tokens and then remove the punctuation from the words using the clean_up function in detect.py. if after calling clean_up the resulting word is an empty string, then it isn't considered a word. · sentence = sequence of characters that: is terminated by (but doesn't include) the characters ! ? . or the end of the file; excludes whitespace on either end; and is not empty. create a single string that contains the entire file and then call split_text on that string. · phrases = non-empty sections of sentences that are separated by colons, commas, or semi-colons (:,;) task 1: since several features require the program to split a string on any of a set of different separators, write a helper function to do this task. to do this you will complete the function split_text as described by the docstring in the code best_score:="" best_score="score" best_author="signature[0]" print("best="" author="" match:="" %s="" with="" score="" %s"%(best_author,="" best_score))="" complete="" and="" test="" (using="" module="" test_detect.py)="" an="" authorship="" detection="" programme="" using="" the="" outlined="" process.="" the="" task.="" you="" are="" given="" a="" program,="" detect.py,="" which="" will="" take="" a="" text="" file="" as="" input="" and="" calculate="" the="" score="" for="" certain="" patterns="" in="" the="" file.="" these="" scores="" can="" be="" compared="" with="" the="" previously="" calculated="" scores="" for="" texts="" written="" by="" known="" authors.="" you="" have="" a="" set="" of="" files="" containing="" mystery="" texts="" and="" files="" that="" contain="" signatures="" (author="" names)="" for="" the="" mystery="" texts.="" you="" will="" need="" to="" write="" the="" below="" functions="" that="" calculate="" a="" score="" for="" a="" particular="" literary="" feature="" of="" the="" text.="" 
linguistic="" features="" description="" function="" name="" average="" word="" length="" the="" average="" number="" of="" characters="" per="" word,="" calculated="" after="" the="" punctuation="" has="" been="" stripped="" using="" the="" clean_up="" function="" (rounded="" to="" 2="" decimal="" places)="" average_word_length()="" type="" token="" ratio="" the="" number="" of="" different="" words="" used="" in="" a="" text="" divided="" by="" the="" total="" number="" of="" words="" (measures="" how="" repetitive="" the="" vocabulary="" is).="" use="" the="" provided="" clean_up="" function="" so="" "this","this","this,"="" and="" "(this"="" are="" not="" counted="" as="" different="" words="" unique_words_ratio()="" hapax="" legomena="" ratio="" the="" number="" of="" words="" occurring="" exactly="" once="" in="" the="" text="" divided="" by="" the="" total="" number="" of="" words="" hapax_legomena_ratio()="" average="" number="" of="" words="" per="" sentence="" calculate="" the="" mean="" value="" for="" all="" sentences="" in="" the="" text="" average_words_in_sentence()="" average="" number="" of="" phrases="" per="" sentence="" average="" number="" of="" phrases="" per="" sentence.="" find="" the="" phrases="" by="" taking="" each="" sentence,="" as="" defined="" above,="" and="" splitting="" it="" on="" any="" of="" colon,="" semi-colon="" or="" comma.="" average_sentence_complexity()="" definitions:="" ·="" token="string" that="" you="" get="" from="" calling="" the="" string="" method="" split="" on="" a="" line="" of="" the="" file.="" ·="" word="non-empty" token="" from="" the="" file="" that="" isn't="" completely="" made="" up="" of="" punctuation.="" find="" the="" words="" in="" a="" file="" by="" using="" str.split="" to="" find="" the="" tokens="" and="" then="" remove="" the="" punctuation="" from="" the="" words="" using="" the="" clean_up="" function="" in="" detect.py.="" if="" after="" calling="" clean_up="" the="" resulting="" word="" 
is="" an="" empty="" string,="" then="" it="" isn't="" considered="" a="" word.="" ·="" sentence="sequence" of="" characters="" that:="" is="" terminated="" by="" (but="" doesn't="" include)="" the="" characters="" !="" .="" or="" the="" end="" of="" the="" file;="" excludes="" whitespace="" on="" either="" end;="" and="" is="" not="" empty.="" create="" a="" single="" string="" that="" contains="" the="" entire="" file="" and="" then="" call="" split_text="" on="" that="" string.="" ·="" phrases="non-empty" sections="" of="" sentences="" that="" are="" separated="" by="" colons,="" commas,="" or="" semi-colons="" (:,;)="" task="" 1:="" since="" several="" features="" require="" the="" program="" to="" split="" a="" string="" on="" any="" of="" a="" set="" of="" different="" separators,="" write="" a="" helper="" function="" to="" do="" this="" task.="" to="" do="" this="" you="" will="" complete="" the="" function="" split_text="" as="" described="" by="" the="" docstring="" in="" the="">
Answered Same DayDec 01, 2021

Answer To: import os.path import math def clean_up(s): """ Return a version of string str in which all letters...

Ximi answered on Dec 04 2021
139 Votes
import os.path
import math
def clean_up(s):
    """Return *s* converted to lowercase with punctuation stripped
    from both ends.

    Inner punctuation is left untouched, so "don't" stays "don't"
    while "(this," becomes "this".
    """
    edge_punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
    return s.lower().strip(edge_punctuation)
def average_word_length(line_list):
    ''' Return the average length of all words in line_list.
    Do not include surrounding punctuation in words.
    line_list is a non-empty list of strings each ending in \n.
    At least one line in line_list contains a word.

    A word is a whitespace-separated token that is non-empty after
    stripping surrounding punctuation with clean_up; tokens made
    entirely of punctuation are NOT counted as words.
    '''
    word_lengths = []
    for line in line_list:
        # str.split() with no argument splits on any whitespace,
        # including the trailing '\n', so no explicit strip is needed.
        for token in line.split():
            word = clean_up(token)
            if word:  # skip tokens that were pure punctuation
                word_lengths.append(len(word))
    # Guaranteed non-empty by the precondition ("at least one word").
    return sum(word_lengths) / len(word_lengths)
def unique_words_ratio(line_list):
    ''' Return the type token ratio (TTR) for this line_list.
    TTR is the number of different words divided by the total number
    of words.
    line_list is a non-empty list of strings each ending in \n.
    At least one line in line_list contains a word.

    Words are compared after clean_up, so "This," and "(this" count
    as the same word; tokens that clean up to the empty string are
    not words and are excluded from both counts.
    '''
    words = [
        clean_up(token)
        for line in line_list
        for token in line.split()
    ]
    # Drop tokens that were entirely punctuation.
    words = [w for w in words if w]
    # Guaranteed non-empty by the precondition ("at least one word").
    return len(set(words)) / len(words)
def hapax_legomana_ratio(line_list):
    ''' Return the hapax legomena ratio for this text.
    This ratio is the number of words that occur exactly once divided
    by the total number of words.
    line_list is a list of strings each ending in \n.
    At least one line in line_list contains a word.

    Words are normalized with clean_up before counting (so "This,"
    and "this" are the same word); tokens that clean up to the empty
    string are not words.
    '''
    # Tally occurrences of each normalized word.
    counts = {}
    for line in line_list:
        for token in line.split():
            word = clean_up(token)
            if word:  # skip tokens that were pure punctuation
                counts[word] = counts.get(word, 0) + 1
    total_words = sum(counts.values())
    # Hapax legomena: words appearing exactly once in the whole text.
    once_only = sum(1 for c in counts.values() if c == 1)
    # total_words > 0 is guaranteed by the precondition.
    return once_only / total_words
def split_text(original, separators):
'''Return a list of non-empty, non-blank strings from the original string
determined by splitting the string on any of the separators.
separators is a string of single-character separators.'''
# To do: Complete this function's body to meet its...
SOLUTION.PDF

Answer To This Question Is Available To Download

Related Questions & Answers

More Questions »

Submit New Assignment

Copy and Paste Your Assignment Here