"""Information Retrieval practicals.

Practical 1 removes English stopwords from a text file.
Practical 2 answers a Boolean query (terms joined by ``&`` / ``|``) against
a term-document incidence matrix built from a small in-memory corpus.

Run the file as a script to execute both practicals in order.
"""

import numpy as np
import pandas as pd

# Input file of Practical 1.  NOTE(review): the original script writes the
# filtered text back to this same path, overwriting the input; that is kept
# here -- confirm it is intended rather than a typo for a second file.
TEXT_FILE = "C:\\Users\\SATISH\\3D Objects\\IR\\textFile1.txt"

# Corpus of Practical 2.  To index a document from disk instead, read the
# file and pass its text as an extra element of ``docs``.
DOCS = ('why hello why there', 'omg hello pony', 'she went there omg')

# Query operator symbol -> element-wise NumPy implementation.
_OPERATORS = {'&': np.bitwise_and, '|': np.bitwise_or}


# Practical 1: Stopword Removal

def split_stopwords(tokens, stop_words):
    """Partition *tokens* into ``(kept, removed)`` lists.

    A token is removed when its lowercase form is in *stop_words*; the
    comparison is case-insensitive so that capitalised stopwords (e.g. a
    sentence-initial "The") are caught by a lowercase stopword list.
    Original token spelling and order are preserved in both lists.
    """
    kept = []
    removed = []
    for token in tokens:
        if token.lower() in stop_words:
            removed.append(token)
        else:
            kept.append(token)
    return kept, removed


def run_stopword_removal(path=TEXT_FILE):
    """Read *path*, strip English stopwords, and write the result back.

    Prints the original text, the filtered text, and the stopwords that
    were dropped.  Raises ``OSError`` if *path* cannot be read or written.
    """
    # Imported lazily so the pure helpers in this module stay importable
    # on machines without NLTK installed.
    import nltk  # noqa: F401  (kept from the original script)
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    with open(path) as source:
        text = source.read()
    print("Actual sentence: ", text)

    stop_words = set(stopwords.words('english'))
    kept, removed = split_stopwords(word_tokenize(text), stop_words)
    filtered = " ".join(kept)

    # The context manager guarantees the output is flushed and closed.
    with open(path, "w") as target:
        target.write(filtered)

    print("Stopwords removal: ", filtered)
    print("Actual Stopwords: ", " ".join(removed))


# Practical 2: Incidence Matrix

def binarize(counts):
    """Return a copy of *counts* with every value above 1 capped at 1.

    Turns a term-count table into a 0/1 incidence table.  Every row is
    processed, including the last one.
    """
    return counts.clip(upper=1)


def parse_query(query):
    """Split a whitespace-separated query into ``(terms, operators)``.

    Tokens at even positions are terms, tokens at odd positions are
    operators, e.g. ``"hello & there | omg"`` gives
    ``(['hello', 'there', 'omg'], ['&', '|'])``.
    """
    tokens = query.split()
    return tokens[0::2], tokens[1::2]


def evaluate_query(incidence, terms, operators):
    """Evaluate a Boolean query against a 0/1 incidence table.

    *incidence* is a DataFrame with one row per document and one column per
    vocabulary term.  Operators are applied strictly left to right with no
    precedence: ``t0 op0 t1 op1 t2`` is ``((t0 op0 t1) op1 t2)``.

    Terms are lowercased before lookup (the vocabulary built by
    ``CountVectorizer`` is lowercase by default); a term that is not in the
    vocabulary matches no document.

    Returns a list with one ``int`` (0 or 1) per document.
    Raises ``ValueError`` for an empty query, a query that does not
    alternate terms and operators, or an unknown operator.
    """
    if not terms:
        raise ValueError("query must contain at least one term")
    if len(terms) != len(operators) + 1:
        raise ValueError(
            "query must alternate terms and operators, "
            "e.g. 'hello & there | omg'"
        )

    def column(term):
        key = term.lower()
        if key in incidence.columns:
            return incidence[key].to_numpy()
        return np.zeros(len(incidence), dtype=np.int64)

    result = column(terms[0])
    for symbol, term in zip(operators, terms[1:]):
        if symbol not in _OPERATORS:
            raise ValueError(
                "unknown operator %r (expected '&' or '|')" % symbol
            )
        result = _OPERATORS[symbol](result, column(term))
    return [int(value) for value in result]


def matching_documents(result):
    """Return the 0-based indices of the documents flagged 1 in *result*."""
    return [index for index, hit in enumerate(result) if hit == 1]


def run_incidence_matrix(docs=DOCS):
    """Build the incidence matrix for *docs* and answer one typed query."""
    # Imported lazily so the pure helpers in this module stay importable
    # on machines without scikit-learn installed.
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    df = pd.DataFrame(counts.toarray(), columns=vocabulary)

    query = input("Enter the query: ")
    terms, operators = parse_query(query)
    print(terms)
    print(operators)

    print(df.shape[0])
    print(df.shape[1])
    df = binarize(df)
    print(df)

    result = evaluate_query(df, terms, operators)
    print([result])

    matches = matching_documents(result)
    for doc_number in matches:
        print("Present in Document Number : ", doc_number)
    print("Total documents: ", len(matches))


def main():
    """Run Practical 1 followed by Practical 2."""
    run_stopword_removal()
    run_incidence_matrix()


if __name__ == "__main__":
    main()