Commit f8fbd2ac authored by A.Anne Negomi.Silva

update with grouping

Communication
Interpersonal
Leadership
Problem-solving
Time management
Flexibility/adaptability
Critical thinking
Organization
Creativity
Collaboration
Attention to detail
Dependability/responsibility
Self-motivated
Work ethic
Professionalism
Artistic aptitude
Creativity
Critical observation
Critical thinking
Curiosity
Design aptitude
Desire to learn
Flexibility
Innovation
Logical thinking
Problem-solving
Research
Resourcefulness
Thinking outside the box
Tolerance of change and uncertainty
Troubleshooting
Value education
Willingness to learn
Accuracy
Assertive
Conflict management
Decision making
Diplomatic
Ethical
Humble
Influential
Insightful
Intuitive
Listening
Patience
Perceptive
Practical
Realistic
Reflective
Teamwork
Accountable
Adaptability
Capable
Competence
Dynamic
Helpfulness
Honesty
Loyal
Punctual
Reliable
Responsible
Teachable
Trustworthy
Articulate
Attentiveness
Collaborative
Conscientiousness
Considerate
Empathy
Encouraging
Inclusive
Leadership
Listening
Management
Negotiation
Nonverbal communication
Persuasion
Professional
Relationship building
Respectful
Sense of humor
Sincere
Sociable
Storytelling
Teaching
Training
Understanding
Verbal communication
Relationship building
Storytelling
Ambition
Alertness
Amiability
Confidence
Dedication
Dependability
Determination
Energy
Hardworking
Independent
Life skills
Optimism
Positive
Resilience
Strong Work ethic
Productive
Enterprising
Visionary
Passion
Assertiveness
Compassionate
Effective communicator
Ethical
Functions well under pressure
Generosity
Good attitude
High Emotional Intelligence
Honest
Independent
Integrity
Interviewing
Knowledge management
Meets deadlines
Memory Skills
Motivating
Personal Development
Outgoing
Performance management
Positive work ethic
Process improvement
Quick-witted
Results-oriented
Self-awareness
Self-supervising
Stress management
Team player
Time Management
Tolerant
Trainable
Training
Troubleshooting
Willing to accept feedback
Willingness to learn
Works well under pressure
Active listening
Communication
Computer skills
Customer service
Interpersonal skills
Leadership
Management skills
Problem-solving
Time management
Transferable skills
Active listening
Empathy
Interpersonal skills
Problem-solving
Reliability
Communication
Empathy
Flexibility
Leadership
Patience
Ability to teach and mentor
Flexibility
Risk-taking
Team building
Time management
Decision-making
Project planning
Task delegation
Team communication
Team leadership
Attention to detail
Collaboration
Communication
Patience
Research
Delegating tasks
Focus
Goal setting
Organization
Prioritization
Ambition
Creativity
Empathy
Leadership
Teamwork
Creativity
Interpersonal Skills
Critical Thinking
Problem Solving
Public Speaking
Customer Service Skills
Teamwork Skills
Communication
Collaboration
Accounting
Active Listening
Adaptability
Negotiation
Conflict Resolution
Decision-making
Empathy
Customer Service
Decision Making
Management
Leadership skills
Organization
Language skills
Administrative skills
Emotional Intelligence
Attention to Detail
Responsibility
Computer Software and Application Knowledge
Design
Data Analysis
Negotiation
Mathematics
Project Management
Marketing
Administrative
Interpersonal
Attention to detail
Initiative
Management and organisational
Willingness to learn
Ability to handle pressure and meet deadlines
Flexibility
Written communication
Verbal communication
Motivation
Curiosity
Sense of responsibility
Accountability
Willingness to learn
Influencing
Desire to progress
Listening
Self-awareness
Compassion
Practicality
Resourcefulness
Adaptability
Loyalty
Motivated
Passionate
Flexible
Organised
Conscientious
Self-starter
Goal Setting
Motivation
Dependability
Independence
Initiative
Work Ethic
Problem Solving
Loyalty
Teamworking
Friendliness
Cooperating
Listening
Following Directions
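The list above is a plain-text dictionary of soft skills, one entry per line and grouped by theme (duplicates across groups are expected). The extraction script below looks these entries up case-insensitively in tokenized job postings. A minimal sketch of that lookup, assuming the list is saved as Skills.txt and NLTK's punkt tokenizer data is available; the sample sentence and expected output are illustrative only:

from nltk.tokenize import word_tokenize

with open("Skills.txt") as f:   # assumed filename
    skills = [line.rstrip() for line in f if line.strip()]

def match_skills(text, skill_list):
    # Case-insensitive, order-preserving, deduplicated match of single-token skills.
    tokens = {t.upper() for t in word_tokenize(text)}
    return list(dict.fromkeys(s for s in skill_list if s.upper() in tokens))

match_skills("We need a dependable candidate with strong communication and creativity", skills)
# e.g. ['Communication', 'Creativity'] -- multi-word entries such as 'Team player'
# are not caught by single-token matching, a limitation shared by the script below.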
#!/usr/bin/env python
# coding: utf-8
# In[4]:
#import Libraries
import os
import codecs
import numpy as np
import re
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
# In[5]:
def remove_punctuation(data):
    # Replace each punctuation symbol with a space, then collapse double spaces.
    symbols = "!\"#$%&*+.:;<=>?@()[]^_`{|}~,\n*|/"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
    data = np.char.replace(data, "  ", " ")
    return str(data)
# In[6]:
def remove_stop_words(data):
    # Drop English stop words from the text.
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return new_text
# In[7]:
directory = r'E:\Research\monster\java-developer'
documents = []
for root, dirnames, filenames in os.walk(directory):
    for filename in filenames:
        if filename.endswith('.txt'):
            fname = os.path.join(root, filename)
            with codecs.open(fname, 'r', 'utf-8') as f:
                page = f.read()
            # Normalise whitespace, then strip punctuation and stop words.
            whitespace_removed = re.sub(r'[\t\n\r]', ' ', page)
            pun_removeddata = remove_punctuation(whitespace_removed)
            remove_stopword = remove_stop_words(pun_removeddata)
            documents.append(remove_stopword)
documents[0]
# In[15]:
with open(r"E:\Research\programming language.txt") as f:
pglanguages = [line.rstrip() for line in f]
len(pglanguages)
# In[17]:
with open(r"E:\Research\Education skills.txt") as f:
edu_Skills = [line.rstrip() for line in f]
len(edu_Skills)
# In[18]:
with open(r"E:\Research\Skills.txt") as f:
Skills = [line.rstrip() for line in f]
len(Skills)
# In[13]:
Education_skills = []
for document in documents:
    tokens = word_tokenize(document)
    education_skills = []
    for edu_Skill in edu_Skills:
        for token in tokens:
            if edu_Skill.upper() == token.upper():
                education_skills.append(edu_Skill)
    # Deduplicate while preserving order.
    final_sub = list(dict.fromkeys(education_skills))
    Education_skills.append(','.join(str(c) for c in final_sub))
len(Education_skills)
# In[19]:
regexp_eng = r'\w+\s+Engineer'
# Matches patterns like "3 years experience", "3-5 years experience" or "3/5 experience".
regexp_exp = r'((\d+(\-|\/)\d+|\d+) years experience)|((\d+(\-|\/)\d+|\d+) experience)'
Engineer = []
Languages = []
Experience = []
extra_skills = []
Education_skills = []
for document in documents:
    # Job role: first "<word> Engineer" phrase found in the posting.
    engineering = re.findall(regexp_eng, document)
    if engineering == []:
        Engineer.append("None")
    else:
        Engineer.append(engineering[0])
    # Years of experience.
    experience = re.findall(regexp_exp, document)
    if experience == []:
        Experience.append("None")
    else:
        Experience.append(experience[0][0])
    # Programming languages mentioned in the posting.
    sub_language = []
    tokens = word_tokenize(document)
    for pglanguage in pglanguages:
        for token in tokens:
            if pglanguage.upper() == token.upper():
                sub_language.append(pglanguage)
    final_sub = list(dict.fromkeys(sub_language))
    Languages.append(','.join(str(c) for c in final_sub))
    # Soft skills.
    skills = []
    for Skill in Skills:
        for token in tokens:
            if Skill.upper() == token.upper():
                skills.append(Skill)
    final_skill = list(dict.fromkeys(skills))
    extra_skills.append(','.join(str(c) for c in final_skill))
    # Education-related skills.
    education_skills = []
    for edu_Skill in edu_Skills:
        for token in tokens:
            if edu_Skill.upper() == token.upper():
                education_skills.append(edu_Skill)
    final_edu = list(dict.fromkeys(education_skills))
    Education_skills.append(','.join(str(c) for c in final_edu))
df_Engineer = pd.DataFrame({'Job Role': Engineer})
df_Languages = pd.DataFrame({'Programming Languages': Languages})
df_Experience = pd.DataFrame({'Experience': Experience})
df_Skills = pd.DataFrame({'Skills': extra_skills})
df_Education = pd.DataFrame({'Education Skills': Education_skills})
print(df_Engineer)
print(df_Languages)
print(df_Experience)
print(df_Skills)
print(df_Education)
# In[23]:
df = pd.concat([df_Engineer, df_Languages, df_Experience, df_Skills], axis=1)
# In[67]:
#len(Engineer)
#df
# Note: the second filter overrides the first, so only 'Software Engineer' rows are kept.
newdf = df[df['Job Role'] != 'None']
newdf = df[df['Job Role'] == 'Software Engineer']
newdf
# In[68]:
newdf.to_excel(r'E:\Research\data1.xlsx', header=True, index=False)
# In[25]:
# For each row, combine all the columns into one column
df2 = newdf.apply(lambda x: ','.join(x.astype(str)), axis=1)
df2
# In[26]:
df_clean = pd.DataFrame({'clean': df2})
df_clean
# In[27]:
# Create the list of list format of the custom corpus for gensim modeling
sent = [row.split(',') for row in df_clean['clean']]
#show the example of list of list format of the custom corpus for gensim modeling
#sent[:2]
sent
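# Each element of `sent` is one posting's comma-joined row split back into tokens,
# e.g. (hypothetical row) ['Software Engineer', 'Java', 'Python', '3-5 years experience'].
# These token lists are the "sentences" fed to Word2Vec below.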
# In[29]:
# Train a gensim Word2Vec model on the custom corpus.
from gensim.models import Word2Vec
model = Word2Vec(sent, min_count=1, size=50, workers=3, window=3, sg=1)
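# Note (assuming this was written against gensim 3.x): in gensim >= 4 the `size`
# argument is renamed `vector_size`, and vectors/similarities are accessed through
# `model.wv` instead of indexing the model directly, e.g.:
#   model = Word2Vec(sent, min_count=1, vector_size=50, workers=3, window=3, sg=1)
#   model.wv['Software Engineer']
#   model.wv.similarity('Software Engineer', 'Java')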
# In[30]:
model['Software Engineer']
# In[31]:
model.similarity('Software Engineer', 'Java')
# In[32]:
model.most_similar('Software Engineer')[:5]
# In[51]:
# Keep only the unique job roles and the unique programming languages.
Jobs = list(newdf['Job Role'].unique())
languages = list(newdf['Programming Languages'].unique())
languages = ','.join(str(c) for c in languages)
languages = languages.split(',')
languages = set(languages)
languages
len(languages)
# In[61]:
# Cosine distance function: rank target_list items by cosine similarity to `word`.
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cosine_distance(model, word, target_list, num):
    cosine_dict = {}
    word_list = []
    a = model[word]
    for item in target_list:
        if item != word:
            b = model[item]
            cos_sim = dot(a, b) / (norm(a) * norm(b))
            cosine_dict[item] = cos_sim
    # Sort in descending order of similarity.
    dist_sort = sorted(cosine_dict.items(), key=lambda dist: dist[1], reverse=True)
    for item in dist_sort:
        word_list.append((word, item[0], item[1] * 100))
    return word_list[0:num]
# In[62]:
# Show the programming languages most similar to 'Software Engineer' by cosine distance.
Software_Engineer_lang = cosine_distance(model, 'Software Engineer', languages, 119)
# In[63]:
Software_Engineer_lang
# In[64]:
Software_Engineer_language = pd.DataFrame(Software_Engineer_lang,columns=["Engineering", "Language", "score"])
# In[65]:
Software_Engineer_language
# In[66]:
Software_Engineer_language.to_excel(r'E:\Research\Software_Engineer_language.xlsx', header=True, index=False)
# In[ ]:
# In[ ]:
# In[ ]:
main = []
for document in documents[:20]:
    sub = []
    tokens = word_tokenize(document)
    for pglanguage in pglanguages:
        for token in tokens:
            if pglanguage.upper() == token.upper():
                sub.append(pglanguage)
    # Deduplicate while preserving order.
    final_sub = list(dict.fromkeys(sub))
    main.append(final_sub)
# In[ ]:
main
# In[ ]:
regexp_exp = r'(\d+\-|\/\d+|\d+ years experience)'
Experience = []
for document in documents:
    experience = re.findall(regexp_exp, document)
    if experience == []:
        Experience.append("None")
    else:
        Experience.append(experience[0])
Experience
# In[ ]:
# In[ ]:
# Note: n=1 here yields unigrams, not bigrams, despite the variable name.
bigrams = ngrams(documents[0].split(), 1)
for grams in bigrams:
    strTest = ' '.join(grams)
    print(strTest)
# In[ ]:
def keyword_flags(contents, keylist, name_of_org):
    # For each keyword, flag YES if it appears in any bigram of the text, else NO.
    n = 2
    flags = []
    for key in keylist:
        found = 'NO'
        bigrams = ngrams(contents.split(), n)
        for grams in bigrams:
            strTest = ' '.join(grams)
            if key in strTest.upper():
                found = 'YES'
                break
        flags.append(found)
    df_new = pd.DataFrame(flags)
    d1 = df_new.transpose()
    d1.columns = keylist
    d1.index = [name_of_org]
    print("languages")
    return d1
# In[ ]:
document2 = ' '.join(str(c) for c in documents)
regexp_eng = r'\w+\s+Engineer'
engineering = re.findall(regexp_eng, document2)
engineering
# In[ ]:
clndata = remove_punctuation(documents[0])
main_content_string = re.sub(r'\s+', ' ', clndata).strip()
# Drop single-character tokens.
clndata = ' '.join([w for w in main_content_string.split() if len(w) > 1])
clndata
# In[ ]:
from nltk import ngrams
for document in documents:
    pass  # placeholder: loop body not implemented
# In[ ]:
Scala
JavaScript
Java
PHP
Swift
Python
# -*- coding: utf-8 -*-
"""vector.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1c5MqjdkRr1GZ2Ji6ILd3NI2i5ff3Ayip
"""
# Commented out IPython magic to ensure Python compatibility.
import nltk
from nltk import word_tokenize , sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
# %matplotlib inline
nltk.download('punkt')
nltk.download('gutenberg')
from nltk.corpus import gutenberg
gutenberg.fileids()
len(gutenberg.fileids())
sent_tokenize(gutenberg.raw())[1]
word_tokenize(sent_tokenize(gutenberg.raw())[1])
len(gutenberg.words())
input_data=gutenberg.sents()
"""Execute Word2Vec Model"""
model = Word2Vec(sentences=input_data, size=64, sg=1, window=10, min_count=5, seed=123)
len(model.wv.vocab)
model.wv['king']
"""Explore the Word2Vec Model
"""
# Note: most_similar raises a KeyError for tokens absent from the Gutenberg vocabulary,
# which is likely the case for 'java' in this corpus.
model.wv.most_similar('java')
model.wv.most_similar('work')
model.wv.most_similar('study')
model.wv.most_similar('day')
model.wv.most_similar('January')
model.wv.doesnt_match('November August Sunday'.split())
model.wv.similarity('father','mother')
model.wv.similarity('father','queen')
model.wv.similarity('father','king')
model.wv.similarity('study','book')
model.wv.similarity('study','food')
model.wv.similarity('king','queen')
model.wv.most_similar(positive=['son','woman'],negative=['man'])
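# The positive/negative query above is word-vector analogy arithmetic: it ranks the
# vocabulary by cosine similarity to roughly vec('son') + vec('woman') - vec('man'),
# the same pattern as the classic "king - man + woman ~ queen" example.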
X = model.wv[model.wv.vocab]
tsne = TSNE(n_components=2, n_iter=250)
x_2d = tsne.fit_transform(X)
wv_df = pd.DataFrame(x_2d, columns=['x', 'y'])
wv_df['token'] = list(model.wv.vocab.keys())
wv_df.head()
wv_df.plot.scatter('x', 'y', figsize=(12, 12), marker='.', s=10, alpha=0.2)