Commit 69a0ea3b authored by Wickramasinghe R.J.P

data pre-processing module updated

parent 6f7c3ac3
...@@ -3,23 +3,19 @@ import re ...@@ -3,23 +3,19 @@ import re
import pandas as pd import pandas as pd
if __name__ == '__main__': if __name__ == '__main__':
df = pd.read_csv('C:\\Users\\LENOVO\\Music\\New folder\\Ontology generation\\petmd.csv') df = pd.read_csv('petmd.csv')
df_col = df.columns df_col = df.columns
data_dict = { data_dict = {
'Disease': [], 'Disease': [],
'DiseaseDescription': [], 'DiseaseDescription': [],
'DiseaseCause': [],
'DiseaseCauseDescription': [], 'DiseaseCauseDescription': [],
'DiseaseDiagnose': [],
'DiseaseDiagnoseDescription': [], 'DiseaseDiagnoseDescription': [],
'DiseasePrevention': [],
'DiseasePreventionDescription': [], 'DiseasePreventionDescription': [],
'DiseaseSymptoms': [], 'DiseaseSymptomDescription': [],
'DiseaseSymptomsDescription': [],
'DiseaseTreatment': [],
'DiseaseTreatmentDescription': [], 'DiseaseTreatmentDescription': [],
'DiseaseInfection': []
} }
disease = '' disease = ''
...@@ -63,13 +59,11 @@ if __name__ == '__main__': ...@@ -63,13 +59,11 @@ if __name__ == '__main__':
topic = row[1][df_col[1]] topic = row[1][df_col[1]]
description = row[1][df_col[2]] description = row[1][df_col[2]]
if regexp.search(str(topic)): if regexp.search(str(topic)):
data_dict['DiseaseCause'].append(str(disease).strip().replace(" ", "") + 'Cause')
data_dict['DiseaseCauseDescription'].append(str(description)) data_dict['DiseaseCauseDescription'].append(str(description))
temp = 1 temp = 1
break break
if temp == 0: if temp == 0:
data_dict['DiseaseCause'].append('')
data_dict['DiseaseCauseDescription'].append('') data_dict['DiseaseCauseDescription'].append('')
temp = 0 temp = 0
...@@ -78,13 +72,11 @@ if __name__ == '__main__': ...@@ -78,13 +72,11 @@ if __name__ == '__main__':
topic = row[1][df_col[1]] topic = row[1][df_col[1]]
description = row[1][df_col[2]] description = row[1][df_col[2]]
if regexp.search(str(topic)): if regexp.search(str(topic)):
data_dict['DiseaseDiagnose'].append(str(disease).strip().replace(" ", "") + 'Diagnose')
data_dict['DiseaseDiagnoseDescription'].append(str(description)) data_dict['DiseaseDiagnoseDescription'].append(str(description))
temp = 1 temp = 1
break break
if temp == 0: if temp == 0:
data_dict['DiseaseDiagnose'].append('')
data_dict['DiseaseDiagnoseDescription'].append('') data_dict['DiseaseDiagnoseDescription'].append('')
temp = 0 temp = 0
...@@ -93,13 +85,11 @@ if __name__ == '__main__': ...@@ -93,13 +85,11 @@ if __name__ == '__main__':
topic = row[1][df_col[1]] topic = row[1][df_col[1]]
description = row[1][df_col[2]] description = row[1][df_col[2]]
if regexp.search(str(topic)): if regexp.search(str(topic)):
data_dict['DiseasePrevention'].append(str(disease).strip().replace(" ", "") + 'Prevention')
data_dict['DiseasePreventionDescription'].append(str(description)) data_dict['DiseasePreventionDescription'].append(str(description))
temp = 1 temp = 1
break break
if temp == 0: if temp == 0:
data_dict['DiseasePrevention'].append('')
data_dict['DiseasePreventionDescription'].append('') data_dict['DiseasePreventionDescription'].append('')
temp = 0 temp = 0
...@@ -108,14 +98,12 @@ if __name__ == '__main__': ...@@ -108,14 +98,12 @@ if __name__ == '__main__':
topic = row[1][df_col[1]] topic = row[1][df_col[1]]
description = row[1][df_col[2]] description = row[1][df_col[2]]
if regexp.search(str(topic)): if regexp.search(str(topic)):
data_dict['DiseaseSymptoms'].append(str(disease).strip().replace(" ", "") + 'Symptoms') data_dict['DiseaseSymptomDescription'].append(str(description))
data_dict['DiseaseSymptomsDescription'].append(str(description))
temp = 1 temp = 1
break break
if temp == 0: if temp == 0:
data_dict['DiseaseSymptoms'].append('') data_dict['DiseaseSymptomDescription'].append('')
data_dict['DiseaseSymptomsDescription'].append('')
temp = 0 temp = 0
for row in grouped_rows.iterrows(): for row in grouped_rows.iterrows():
...@@ -123,13 +111,47 @@ if __name__ == '__main__': ...@@ -123,13 +111,47 @@ if __name__ == '__main__':
topic = row[1][df_col[1]] topic = row[1][df_col[1]]
description = row[1][df_col[2]] description = row[1][df_col[2]]
if regexp.search(str(topic)): if regexp.search(str(topic)):
data_dict['DiseaseTreatment'].append(str(disease).strip().replace(" ", "") + 'Treatment')
data_dict['DiseaseTreatmentDescription'].append(str(description)) data_dict['DiseaseTreatmentDescription'].append(str(description))
temp = 1 temp = 1
break break
if temp == 0: if temp == 0:
data_dict['DiseaseTreatment'].append('')
data_dict['DiseaseTreatmentDescription'].append('') data_dict['DiseaseTreatmentDescription'].append('')
pd.DataFrame(data_dict).to_csv('dogDisease.csv', index=False) temp = 0
for row in grouped_rows.iterrows():
description = row[1][df_col[2]]
regexp = re.compile(r'Allerg|allerg')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Allergic')
temp = 1
break
regexp = re.compile(r'Bacter|bacter')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Bacterial')
temp = 1
break
regexp = re.compile(r'Flea|flea')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Fleas')
temp = 1
break
regexp = re.compile(r'Fung|fung')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Fungal')
temp = 1
break
regexp = re.compile(r'Virus|virus')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Viral')
temp = 1
break
if temp == 0:
data_dict['DiseaseInfection'].append('')
pd.DataFrame(data_dict).drop_duplicates().to_csv('dogDisease.csv', index=False)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment