Sale!

Homework 1 Python Questions

$30.00

'''Homework 1 Python Questions

Implement the following functions.

Do not add any more import lines to this file than the ones
already here without asking for permission on Piazza.
Use the regular expression tools built into Python; do NOT use bash.
'''

import re

def check_for_foo_or_bar(text):
”’Checks whether the input string meets the following condition.

The string must have both the word ‘foo’ and the word ‘bar’ in it,
whitespace- or punctuation-delimited from other words.
(not, e.g., words like ‘foobar’ or ‘bart’ that merely contain
the word ‘bar’);

Category:

Description

5/5 - (3 votes)

'''Homework 1 Python Questions

Implement the following functions.

Do not add any more import lines to this file than the ones
already here without asking for permission on Piazza.
Use the regular expression tools built into Python; do NOT use bash.
'''

import re

def check_for_foo_or_bar(text):
    """Check whether the text contains both the word 'foo' and the word 'bar'.

    Each word must be whitespace- or punctuation-delimited from other words,
    so strings such as 'foobar' or 'bart', which merely contain 'foo' or
    'bar', do not count. The two words may appear in either order and may be
    on different lines of the text.

    See the Python regular expression documentation:
    https://docs.python.org/3.4/library/re.html#match-objects

    Args:
        text: The string to inspect.

    Returns:
        True if the condition is met, False otherwise.
    """
    # Two independent searches replace a single '.*foo.*bar.*' style pattern:
    # '.' does not match a newline, so that form wrongly rejected inputs in
    # which the two words sit on different lines.
    has_foo = re.search(r'\bfoo\b', text) is not None
    has_bar = re.search(r'\bbar\b', text) is not None
    return has_foo and has_bar

def replace_rgb(text):
    """Replace every RGB or hex color in the text with the word 'COLOR'.

    Possible formats for a color string:
        #0f0
        #0b013a
        #37EfaA
        rgb(1, 1, 1)
        rgb(255,19,32)
        rgb(00,01, 18)
        rgb(0.1, 0.5,1.0)

    There is no need to recognize rgba or other formats not listed above,
    and the ranges of the rgb values are not validated.

    All numbers must be valid numbers, however: '#xyzxyz' is left unchanged
    because those are not hex digits, and so is 'rgb(c00l, 255, 255)'.

    Only colors at the beginning or end of the text, or whitespace-separated
    from the text around them, are replaced. For example, due to the
    trailing period:

        'I like rgb(1, 2, 3) and rgb(2, 3, 4).'
    becomes
        'I like COLOR and rgb(2, 3, 4).'

    See the Python regular expression documentation:
    https://docs.python.org/3.4/library/re.html#re.sub

    Args:
        text: The string to process.

    Returns:
        The text with all RGB or hex colors replaced with the word 'COLOR'.
    """
    # Raw strings keep the regex escapes (\d, \s, \() out of Python's own
    # string-escape processing.
    hex_color = r'#(?:[0-9A-Fa-f]{6}|[0-9A-Fa-f]{3})'
    # Digits, optionally followed by a decimal point and more digits.
    number = r'\d+\.?\d*'
    rgb_color = (r'rgb\(' + number + r',\s*' + number + r',\s*' + number
                 + r'\)')
    # (?<!\S) / (?!\S): the color must be bounded by whitespace or by the
    # start/end of the text. Lookarounds consume nothing, so two colors
    # separated by a single space are both replaced.
    pattern = r'(?<!\S)(?:' + hex_color + r'|' + rgb_color + r')(?!\S)'
    return re.sub(pattern, 'COLOR', text)

def edit_distance(str1, str2):
    """Compute the minimum edit distance between the two strings.

    Insertion, deletion and substitution all cost 1. The computation is the
    iterative dynamic-programming algorithm (see Section 2.4 in Jurafsky and
    Martin); no recursion is used.

    Args:
        str1: The first string.
        str2: The second string.

    Returns:
        An integer representing the string edit distance between str1 and
        str2.
    """
    # With unit costs the distance is symmetric, so put the shorter string
    # along the row to keep the two working rows as small as possible.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # previous[j] is the distance between the prefix of str1 processed so far
    # and the first j characters of str2. Initially that prefix is empty, so
    # the distance is j insertions.
    previous = list(range(len(str2) + 1))
    for row, char1 in enumerate(str1, start=1):
        # Turning `row` characters into the empty string takes `row` deletions.
        current = [row]
        for col, char2 in enumerate(str2, start=1):
            if char1 == char2:
                # Matching characters cost nothing: carry the diagonal over.
                cost = previous[col - 1]
            else:
                cost = 1 + min(previous[col],       # deletion
                               current[col - 1],    # insertion
                               previous[col - 1])   # substitution
            current.append(cost)
        previous = current
    return previous[-1]

def _read_wine_reviews(wine_file_path):
    """Parse the wine file into a list of (words, stars) pairs, one per review."""
    reviews = []
    with open(wine_file_path) as wine_file:
        for line in wine_file:
            # Each line is '<review text>\t<stars>'. Split on the last tab so
            # the final line is parsed correctly even without a newline.
            text, tab, stars = line.rstrip('\n').rpartition('\t')
            if not tab:
                # No rating on this line (e.g. a blank line): not a review.
                continue
            # Consecutive spaces yield empty strings, which are not words.
            words = [word for word in text.split(' ') if word]
            reviews.append((words, stars.strip()))
    return reviews


def _tally(items, counts):
    """Add one to counts[item] for every item, creating entries as needed."""
    for item in items:
        counts[item] = counts.get(item, 0) + 1


def _print_most_common(counts, exclude=(), limit=10):
    """Print up to `limit` 'word<TAB>count' lines, most frequent first.

    Words found in `exclude` are skipped. Ties keep first-seen order because
    the sort is stable. A blank separator is printed afterwards.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    shown = 0
    for word, count in ranked:
        if word in exclude:
            continue
        print(word + '\t' + str(count))
        shown += 1
        if shown == limit:
            break
    print("\n")


def wine_text_processing(wine_file_path, stopwords_file_path):
    """Process the two files to answer the questions below; print to stdout.

    1. What is the distribution over star ratings?
    2. What are the 10 most common words used across all of the reviews, and
       how many times is each used?
    3. How many times does the word 'a' appear?
    4. How many times does the word 'fruit' appear?
    5. How many times does the word 'mineral' appear?
    6. Common words (like 'a') are not as interesting as uncommon words (like
       'mineral'). In natural language processing, we call these common words
       "stop words" and often remove them before we process text.
       stopwords.txt gives you a list of some very common words. Remove these
       stopwords from your reviews. Also, try converting all the words to
       lower case (since we probably don't want to count 'fruit' and 'Fruit'
       as two different words). Now what are the 10 most common words across
       all of the reviews, and how many times is each used?
    7. You should continue to use the preprocessed reviews for the following
       questions (lower-cased, no stopwords). What are the 10 most used words
       among the 5 star reviews, and how many times is each used?
    8. What are the 10 most used words among the 1 star reviews, and how many
       times is each used?
    9. Gather two sets of reviews: 1) those that use the word "red" and
       2) those that use the word "white". What are the 10 most frequent
       words in the "red" reviews which do NOT appear in the "white" reviews?
    10. What are the 10 most frequent words in the "white" reviews which do
        NOT appear in the "red" reviews?

    Each answer is followed by a blank separator. Word lists are printed as
    'word<TAB>count' lines, and the star distribution as 'stars<TAB>count'.

    Args:
        wine_file_path: Path to the reviews file; each line holds the review
            text, a tab, and a rating written as a run of '*' characters.
        stopwords_file_path: Path to a file with one stopword per line.

    No return value.
    """
    # The reviews are read once and kept in memory, rather than re-reading
    # the file for every question.
    reviews = _read_wine_reviews(wine_file_path)

    # Question 1: star distribution, highest rating first (a longer run of
    # '*' sorts after a shorter one, so reverse order is descending).
    star_counts = {}
    _tally((stars for _, stars in reviews), star_counts)
    for stars in sorted(star_counts, reverse=True):
        print(stars + '\t' + str(star_counts[stars]))
    print("\n")

    # Question 2: raw word counts (case-sensitive, stopwords included).
    raw_counts = {}
    for words, _ in reviews:
        _tally(words, raw_counts)
    _print_most_common(raw_counts)

    # Questions 3-5: a word that never occurs has a count of 0, not an error.
    for word in ('a', 'fruit', 'mineral'):
        print(str(raw_counts.get(word, 0)) + "\n")

    # A set gives constant-time stopword membership tests.
    with open(stopwords_file_path) as stopwords_file:
        stopwords = {line.strip() for line in stopwords_file}

    all_counts = {}
    five_star_counts = {}
    one_star_counts = {}
    red_counts = {}
    white_counts = {}
    for words, stars in reviews:
        lowered = [word.lower() for word in words]
        kept = [word for word in lowered if word not in stopwords]
        _tally(kept, all_counts)
        if stars == '*****':
            _tally(kept, five_star_counts)
        if stars == '*':
            _tally(kept, one_star_counts)
        # A review that mentions both colors contributes to both tallies.
        if 'red' in lowered:
            _tally(kept, red_counts)
        if 'white' in lowered:
            _tally(kept, white_counts)

    _print_most_common(all_counts)                        # Question 6
    _print_most_common(five_star_counts)                  # Question 7
    _print_most_common(one_star_counts)                   # Question 8
    _print_most_common(red_counts, exclude=white_counts)  # Question 9
    _print_most_common(white_counts, exclude=red_counts)  # Question 10