Commit f8fbd2ac authored by A.Anne Negomi.Silva

update with grouping

Communication
Interpersonal
Leadership
Problem-solving
Time management
Flexibility/adaptability
Critical thinking
Organization
Creativity
Collaboration
Attention to detail
Dependability/responsibility
Self-motivated
Work ethic
Professionalism
Artistic aptitude
Creativity
Critical observation
Critical thinking
Curiosity
Design aptitude
Desire to learn
Flexibility
Innovation
Logical thinking
Problem-solving
Research
Resourcefulness
Thinking outside the box
Tolerance of change and uncertainty
Troubleshooting
Value education
Willingness to learn
Accuracy
Assertive
Conflict management
Decision making
Diplomatic
Ethical
Humble
Influential
Insightful
Intuitive
Listening
Patience
Perceptive
Practical
Realistic
Reflective
Teamwork
Accountable
Adaptability
Capable
Competence
Dynamic
Helpfulness
Honesty
Loyal
Punctual
Reliable
Responsible
Teachable
Trustworthy
Articulate
Attentiveness
Collaborative
Conscientiousness
Considerate
Empathy
Encouraging
Inclusive
Leadership
Listening
Management
Negotiation
Nonverbal communication
Persuasion
Professional
Relationship building
Respectful
Sense of humor
Sincere
Sociable
Storytelling
Teaching
Training
Understanding
Verbal communication
Relationship building
Storytelling
Ambition
Alertness
Amiability
Confidence
Dedication
Dependability
Determination
Energy
Hardworking
Independent
Life skills
Optimism
Positive
Resilience
Strong Work ethic
Productive
Enterprising
Visionary
Passion
Assertiveness
Compassionate
Effective communicator
Ethical
Functions well under pressure
Generosity
Good attitude
High Emotional Intelligence
Honest
Independent
Integrity
Interviewing
Knowledge management
Meets deadlines
Memory Skills
Motivating
Personal Development
Outgoing
Performance management
Positive work ethic
Process improvement
Quick-witted
Results-oriented
Self-awareness
Self-supervising
Stress management
Team player
Time Management
Tolerant
Trainable
Training
Troubleshooting
Willing to accept feedback
Willingness to learn
Works well under pressure
Active listening
Communication
Computer skills
Customer service
Interpersonal skills
Leadership
Management skills
Problem-solving
Time management
Transferable skills
Active listening
Empathy
Interpersonal skills
Problem-solving
Reliability
Communication
Empathy
Flexibility
Leadership
Patience
Ability to teach and mentor
Flexibility
Risk-taking
Team building
Time management
Decision-making
Project planning
Task delegation
Team communication
Team leadership
Attention to detail
Collaboration
Communication
Patience
Research
Delegating tasks
Focus
Goal setting
Organization
Prioritization
Ambition
Creativity
Empathy
Leadership
Teamwork
Creativity
Interpersonal Skills
Critical Thinking
Problem Solving
Public Speaking
Customer Service Skills
Teamwork Skills
Communication
Collaboration
Accounting
Active Listening
Adaptability
Negotiation
Conflict Resolution
Decision-making
Empathy
Customer Service
Decision Making
Management
Leadership skills
Organization
Language skills
Administrative skills
Emotional Intelligence
Attention to Detail
Responsibility
Computer Software and Application Knowledge
Design
Data Analysis
Negotiation
Mathematics
Project Management
Marketing
Administrative
Interpersonal
Attention to detail
Initiative
Management and organisational
Willingness to learn
Ability to handle pressure and meet deadlines
Flexibility
Written communication
Verbal communication
Motivation
Curiosity
Sense of responsibility
Accountability
Willingness to learn
Influencing
Desire to progress
Listening
Self-awareness
Compassion
Practicality
Resourcefulness
Adaptability
Loyalty
Motivated
Passionate
Flexible
Organised
Conscientious
Self-starter
Goal Setting
Motivation
Dependability
Independence
Initiative
Work Ethic
Problem Solving
Loyalty
Teamworking
Friendliness
Cooperating
Listening
Following Directions
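The list above is a plain-text dictionary of soft skills, one entry per line and grouped by theme (duplicates across groups are expected). The extraction script below looks these entries up case-insensitively in tokenized job postings. A minimal sketch of that lookup, assuming the list is saved as Skills.txt and NLTK's punkt tokenizer data is available; the sample sentence and expected output are illustrative only:

from nltk.tokenize import word_tokenize

with open("Skills.txt") as f:   # assumed filename
    skills = [line.rstrip() for line in f if line.strip()]

def match_skills(text, skill_list):
    # Case-insensitive, order-preserving, deduplicated match of single-token skills.
    tokens = {t.upper() for t in word_tokenize(text)}
    return list(dict.fromkeys(s for s in skill_list if s.upper() in tokens))

match_skills("We need a dependable candidate with strong communication and creativity", skills)
# e.g. ['Communication', 'Creativity'] -- multi-word entries such as 'Team player'
# are not caught by single-token matching, a limitation shared by the script below.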
#!/usr/bin/env python
# coding: utf-8
# In[4]:
#import Libraries
import os
import codecs
import numpy as np
import re
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
# In[5]:
def remove_punctuation(data):
    # Replace each punctuation symbol with a space, then collapse double spaces.
    symbols = "!\"#$%&*+.:;<=>?@()[]^_`{|}~,\n*|/"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
    data = np.char.replace(data, "  ", " ")
    return str(data)
# In[6]:
def remove_stop_words(data):
    # Drop English stop words from the text.
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return new_text
# In[7]:
directory = r'E:\Research\monster\java-developer'
documents = []
for root, dirnames, filenames in os.walk(directory):
    for filename in filenames:
        if filename.endswith('.txt'):
            fname = os.path.join(root, filename)
            with codecs.open(fname, 'r', 'utf-8') as f:
                page = f.read()
            # Normalise whitespace, then strip punctuation and stop words.
            whitespace_removed = re.sub(r'[\t\n\r]', ' ', page)
            pun_removeddata = remove_punctuation(whitespace_removed)
            remove_stopword = remove_stop_words(pun_removeddata)
            documents.append(remove_stopword)
documents[0]
# In[15]:
with open(r"E:\Research\programming language.txt") as f:
pglanguages = [line.rstrip() for line in f]
len(pglanguages)
# In[17]:
with open(r"E:\Research\Education skills.txt") as f:
edu_Skills = [line.rstrip() for line in f]
len(edu_Skills)
# In[18]:
with open(r"E:\Research\Skills.txt") as f:
Skills = [line.rstrip() for line in f]
len(Skills)
# In[13]:
Education_skills = []
for document in documents:
    tokens = word_tokenize(document)
    education_skills = []
    for edu_Skill in edu_Skills:
        for token in tokens:
            if edu_Skill.upper() == token.upper():
                education_skills.append(edu_Skill)
    # Deduplicate while preserving order.
    final_sub = list(dict.fromkeys(education_skills))
    Education_skills.append(','.join(str(c) for c in final_sub))
len(Education_skills)
# In[19]:
regexp_eng = r'\w+\s+Engineer'
# Matches patterns like "3 years experience", "3-5 years experience" or "3/5 experience".
regexp_exp = r'((\d+(\-|\/)\d+|\d+) years experience)|((\d+(\-|\/)\d+|\d+) experience)'
Engineer = []
Languages = []
Experience = []
extra_skills = []
Education_skills = []
for document in documents:
    # Job role: first "<word> Engineer" phrase found in the posting.
    engineering = re.findall(regexp_eng, document)
    if engineering == []:
        Engineer.append("None")
    else:
        Engineer.append(engineering[0])
    # Years of experience.
    experience = re.findall(regexp_exp, document)
    if experience == []:
        Experience.append("None")
    else:
        Experience.append(experience[0][0])
    # Programming languages mentioned in the posting.
    sub_language = []
    tokens = word_tokenize(document)
    for pglanguage in pglanguages:
        for token in tokens:
            if pglanguage.upper() == token.upper():
                sub_language.append(pglanguage)
    final_sub = list(dict.fromkeys(sub_language))
    Languages.append(','.join(str(c) for c in final_sub))
    # Soft skills.
    skills = []
    for Skill in Skills:
        for token in tokens:
            if Skill.upper() == token.upper():
                skills.append(Skill)
    final_skill = list(dict.fromkeys(skills))
    extra_skills.append(','.join(str(c) for c in final_skill))
    # Education-related skills.
    education_skills = []
    for edu_Skill in edu_Skills:
        for token in tokens:
            if edu_Skill.upper() == token.upper():
                education_skills.append(edu_Skill)
    final_edu = list(dict.fromkeys(education_skills))
    Education_skills.append(','.join(str(c) for c in final_edu))
df_Engineer = pd.DataFrame({'Job Role': Engineer})
df_Languages = pd.DataFrame({'Programming Languages': Languages})
df_Experience = pd.DataFrame({'Experience': Experience})
df_Skills = pd.DataFrame({'Skills': extra_skills})
df_Education = pd.DataFrame({'Education Skills': Education_skills})
print(df_Engineer)
print(df_Languages)
print(df_Experience)
print(df_Skills)
print(df_Education)
# In[23]:
df = pd.concat([df_Engineer, df_Languages, df_Experience, df_Skills], axis=1)
# In[67]:
#len(Engineer)
#df
# Note: the second filter overrides the first, so only 'Software Engineer' rows are kept.
newdf = df[df['Job Role'] != 'None']
newdf = df[df['Job Role'] == 'Software Engineer']
newdf
# In[68]:
newdf.to_excel(r'E:\Research\data1.xlsx', header=True, index=False)
# In[25]:
# For each row, combine all the columns into one column
df2 = newdf.apply(lambda x: ','.join(x.astype(str)), axis=1)
df2
# In[26]:
df_clean = pd.DataFrame({'clean': df2})
df_clean
# In[27]:
# Create the list of list format of the custom corpus for gensim modeling
sent = [row.split(',') for row in df_clean['clean']]
#show the example of list of list format of the custom corpus for gensim modeling
#sent[:2]
sent
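# Each element of `sent` is one posting's comma-joined row split back into tokens,
# e.g. (hypothetical row) ['Software Engineer', 'Java', 'Python', '3-5 years experience'].
# These token lists are the "sentences" fed to Word2Vec below.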
# In[29]:
# Train a gensim Word2Vec model on the custom corpus.
from gensim.models import Word2Vec
model = Word2Vec(sent, min_count=1, size=50, workers=3, window=3, sg=1)
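# Note (assuming this was written against gensim 3.x): in gensim >= 4 the `size`
# argument is renamed `vector_size`, and vectors/similarities are accessed through
# `model.wv` instead of indexing the model directly, e.g.:
#   model = Word2Vec(sent, min_count=1, vector_size=50, workers=3, window=3, sg=1)
#   model.wv['Software Engineer']
#   model.wv.similarity('Software Engineer', 'Java')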
# In[30]:
model['Software Engineer']
# In[31]:
model.similarity('Software Engineer', 'Java')
# In[32]:
model.most_similar('Software Engineer')[:5]
# In[51]:
# Keep only the unique job roles and the unique programming languages.
Jobs = list(newdf['Job Role'].unique())
languages = list(newdf['Programming Languages'].unique())
languages = ','.join(str(c) for c in languages)
languages = languages.split(',')
languages = set(languages)
languages
len(languages)
# In[61]:
# Cosine distance function: rank target_list items by cosine similarity to `word`.
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cosine_distance(model, word, target_list, num):
    cosine_dict = {}
    word_list = []
    a = model[word]
    for item in target_list:
        if item != word:
            b = model[item]
            cos_sim = dot(a, b) / (norm(a) * norm(b))
            cosine_dict[item] = cos_sim
    # Sort in descending order of similarity.
    dist_sort = sorted(cosine_dict.items(), key=lambda dist: dist[1], reverse=True)
    for item in dist_sort:
        word_list.append((word, item[0], item[1] * 100))
    return word_list[0:num]
# In[62]:
# Show the programming languages most similar to 'Software Engineer' by cosine distance.
Software_Engineer_lang = cosine_distance(model, 'Software Engineer', languages, 119)
# In[63]:
Software_Engineer_lang
# In[64]:
Software_Engineer_language = pd.DataFrame(Software_Engineer_lang,columns=["Engineering", "Language", "score"])
# In[65]:
Software_Engineer_language
# In[66]:
Software_Engineer_language.to_excel(r'E:\Research\Software_Engineer_language.xlsx', header=True, index=False)
# In[ ]:
# In[ ]:
# In[ ]:
main = []
for document in documents[:20]:
    sub = []
    tokens = word_tokenize(document)
    for pglanguage in pglanguages:
        for token in tokens:
            if pglanguage.upper() == token.upper():
                sub.append(pglanguage)
    # Deduplicate while preserving order.
    final_sub = list(dict.fromkeys(sub))
    main.append(final_sub)
# In[ ]:
main
# In[ ]:
regexp_exp = r'(\d+\-|\/\d+|\d+ years experience)'
Experience = []
for document in documents:
    experience = re.findall(regexp_exp, document)
    if experience == []:
        Experience.append("None")
    else:
        Experience.append(experience[0])
Experience
# In[ ]:
# In[ ]:
# Note: n=1 here yields unigrams, not bigrams, despite the variable name.
bigrams = ngrams(documents[0].split(), 1)
for grams in bigrams:
    strTest = ' '.join(grams)
    print(strTest)
# In[ ]:
def keyword_flags(contents, keylist, name_of_org):
    # For each keyword, flag YES if it appears in any bigram of the text, else NO.
    n = 2
    flags = []
    for key in keylist:
        found = 'NO'
        bigrams = ngrams(contents.split(), n)
        for grams in bigrams:
            strTest = ' '.join(grams)
            if key in strTest.upper():
                found = 'YES'
                break
        flags.append(found)
    df_new = pd.DataFrame(flags)
    d1 = df_new.transpose()
    d1.columns = keylist
    d1.index = [name_of_org]
    print("languages")
    return d1
# In[ ]:
document2 = ' '.join(str(c) for c in documents)
regexp_eng = r'\w+\s+Engineer'
engineering = re.findall(regexp_eng, document2)
engineering
# In[ ]:
clndata = remove_punctuation(documents[0])
main_content_string = re.sub(r'\s+', ' ', clndata).strip()
# Drop single-character tokens.
clndata = ' '.join([w for w in main_content_string.split() if len(w) > 1])
clndata
# In[ ]:
from nltk import ngrams
for document in documents:
    pass  # placeholder: loop body not implemented
# In[ ]:
Scala
JavaScript
Java
PHP
Swift
Python
# -*- coding: utf-8 -*-
"""vector.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1c5MqjdkRr1GZ2Ji6ILd3NI2i5ff3Ayip
"""
# Commented out IPython magic to ensure Python compatibility.
import nltk
from nltk import word_tokenize , sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
# %matplotlib inline
nltk.download('punkt')
nltk.download('gutenberg')
from nltk.corpus import gutenberg
gutenberg.fileids()
len(gutenberg.fileids())
sent_tokenize(gutenberg.raw())[1]
word_tokenize(sent_tokenize(gutenberg.raw())[1])
len(gutenberg.words())
input_data=gutenberg.sents()
"""Execute Word2Vec Model"""
model = Word2Vec(sentences=input_data, size=64, sg=1, window=10, min_count=5, seed=123)
len(model.wv.vocab)
model.wv['king']
"""Explore the Word2Vec Model
"""
# Note: most_similar raises a KeyError for tokens absent from the Gutenberg vocabulary,
# which is likely the case for 'java' in this corpus.
model.wv.most_similar('java')
model.wv.most_similar('work')
model.wv.most_similar('study')
model.wv.most_similar('day')
model.wv.most_similar('January')
model.wv.doesnt_match('November August Sunday'.split())
model.wv.similarity('father','mother')
model.wv.similarity('father','queen')
model.wv.similarity('father','king')
model.wv.similarity('study','book')
model.wv.similarity('study','food')
model.wv.similarity('king','queen')
model.wv.most_similar(positive=['son','woman'],negative=['man'])
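# The positive/negative query above is word-vector analogy arithmetic: it ranks the
# vocabulary by cosine similarity to roughly vec('son') + vec('woman') - vec('man'),
# the same pattern as the classic "king - man + woman ~ queen" example.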
X = model.wv[model.wv.vocab]
tsne = TSNE(n_components=2, n_iter=250)
x_2d = tsne.fit_transform(X)
wv_df = pd.DataFrame(x_2d, columns=['x', 'y'])
wv_df['token'] = list(model.wv.vocab.keys())
wv_df.head()
wv_df.plot.scatter('x', 'y', figsize=(12, 12), marker='.', s=10, alpha=0.2)