Commit 05e607a1 authored by Wickramasinghe R.J.P

changes added for ontology generation

parent c7ac0681
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="FacetManager">
<facet type="django" name="Django">
<configuration>
<option name="rootFolder" value="$MODULE_DIR$" />
<option name="settingsModule" value="djangoProject/settings.py" />
<option name="manageScript" value="$MODULE_DIR$/manage.py" />
<option name="environment" value="&lt;map/&gt;" />
<option name="doNotUseTestRunner" value="false" />
<option name="trackFilePattern" value="migrations" />
</configuration>
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Django" />
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/../djangoProject\templates" />
</list>
</option>
</component>
</module>
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
</list>
</option>
</inspection_tool>
</profile>
</component>
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (djangoProject)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CanisCare_Backend.iml" filepath="$PROJECT_DIR$/.idea/CanisCare_Backend.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
web: gunicorn djangoProject.wsgi:application --log-file -
from django.contrib import admin
from .models import Note
admin.site.register(Note)
from django.apps import AppConfig
class ApiConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'api'
# Generated by Django 4.1 on 2022-08-25 06:14
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
]
operations = [
migrations.CreateModel(
name='Note',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('body', models.TextField()),
('updated', models.DateTimeField(auto_now=True)),
('created', models.DateTimeField(auto_now_add=True)),
],
options={
'ordering': ['-updated'],
},
),
]
from django.db import models
class Note(models.Model):
body = models.TextField()
updated = models.DateTimeField(auto_now=True)
created = models.DateTimeField(auto_now_add=True)
def __str__(self):
return self.body[0:50]
class Meta:
ordering = ['-updated']
from rest_framework.serializers import ModelSerializer
from .models import Note
class NoteSerializer(ModelSerializer):
class Meta:
model = Note
fields = '__all__'
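
# Illustrative usage of the serializer above (hedged: it is not wired to any
# view in this commit; this is only the standard DRF pattern):
#
#     serializer = NoteSerializer(Note.objects.all(), many=True)
#     serializer.data  # -> list of dicts with id, body, updated, created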
from django.test import TestCase
# Create your tests here.
from django.urls import path
from . import views
urlpatterns = [
path('', views.getRoutes),
path('diseasecategory/<str:pk>/', views.get_categorized_diseases),
path('diseases/', views.get_diseases),
path('disease/<str:pk>/', views.get_disease_description),
path('disease/<str:pk>/<str:pk2>/', views.get_disease_sub),
path('disease/<str:pk>/<str:pk2>/<str:pk3>/', views.get_disease_sub_info),
path('diseaselink/<str:pk>/', views.get_link),
]
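
# Illustrative client view of these routes (assumes the dev server on
# http://127.0.0.1:8000 and disease names as generated in dogDisease.csv):
#
#     GET /diseases/                   -> list of disease names
#     GET /diseasecategory/allergic/   -> diseases whose hasInfection is allergic
#     GET /disease/Dandruff/           -> sentence list of the description
#     GET /disease/Dandruff/Symptoms/  -> individuals attached via hasSymptom
#     GET /disease/Dandruff/Symptoms/<individual>/ -> that individual's description
#     GET /diseaselink/Dandruff/       -> rdfs:seeAlso links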
from django.shortcuts import render
from rest_framework.decorators import api_view
from rest_framework.response import Response
from owlready2 import *
from rdflib import *
onto = get_ontology("dogDisease.owl").load(reload_if_newer=True)
graph = default_world.as_rdflib_graph()
@api_view(['GET'])
def getRoutes(request):
routes = [
{
'EndPoint': '/diseases/',
'method': 'GET',
'body': None,
'description': 'Returns a List of Diseases'
},
{
'EndPoint': '/diseasecategory/diseasename',
'method': 'GET',
'body': None,
            'description': 'Returns Categorized Diseases'
},
{
'EndPoint': '/disease/diseasename/',
'method': 'GET',
'body': None,
'description': 'Get Description for the Disease'
},
{
'EndPoint': '/disease/diseasename/infotype/',
'method': 'GET',
'body': None,
'description': 'Get List of Individuals Relevant to Sub Topic'
},
{
'EndPoint': '/disease/diseasename/infotype/individual',
'method': 'GET',
'body': None,
'description': 'Get Relevant Information to Each Individual'
},
{
'EndPoint': '/diseaselink/diseasename',
'method': 'GET',
'body': None,
'description': 'Get SeeAlso Link for Disease'
}
]
return Response(routes)
@api_view(['GET'])
def get_disease_description(request, pk):
disease_description = list(graph.query(
"""
SELECT ?description
WHERE { ?disease <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#NamedDisease>;
FILTER(?disease = <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#""" + pk + """>).
?disease <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#hasDescription> ?description.
}"""
))
    new_disease_description = [str(x[0]) for x in disease_description]
    new_disease_description = ' '.join(new_disease_description)
    # Split the joined text into sentences, dropping the empty trailing
    # fragment left after the final full stop.
    new_disease_description = new_disease_description.split('.')
    new_disease_description = new_disease_description[0:-1]
return Response(new_disease_description)
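
# A safer variant of the lookups above: rdflib can bind the disease IRI as a
# query variable via ``initBindings`` instead of splicing ``pk`` into the
# SPARQL text. This helper is an illustrative sketch; the names DISEASE_NS,
# SKIN_NS and query_description are not part of the original code.
DISEASE_NS = "http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#"
SKIN_NS = "http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#"

def query_description(pk):
    # Braces are doubled to escape them inside the f-string.
    q = f"SELECT ?description WHERE {{ ?disease <{SKIN_NS}hasDescription> ?description . }}"
    return [str(row[0]) for row in graph.query(q, initBindings={'disease': URIRef(DISEASE_NS + pk)})]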
@api_view(['GET'])
def get_disease_sub_info(request, pk, pk2, pk3):
    # pk2 arrives in plural form (e.g. "Treatments"); drop the trailing "s"
    # to match the ontology property names (hasTreatment, hasSymptom, ...).
    new_pk2 = pk2[:-1]
    new_pk3 = new_pk2
    # The ontology's data property for causes is spelled "hasCasueDescription"
    # (see the buildOntology script below), so "Cause" must map to that spelling.
    if new_pk3 == "Cause":
        new_pk3 = "Casue"
sub_individual_list_info = list(graph.query(
"""
SELECT ?individualinfo
WHERE { ?disease <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#NamedDisease>;
FILTER((?disease = <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#""" + pk + """>) && (?treat = <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#""" + pk3 + """>)).
?disease <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#has""" + new_pk2 + """> ?treat.
?treat <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#has""" + new_pk3 + """Description> ?individualinfo.
}"""))
new_sub_individual_list_info = [str(x[0]) for x in sub_individual_list_info]
new_sub_individual_list_info = ' '.join(new_sub_individual_list_info)
new_sub_individual_list_info = new_sub_individual_list_info.split('.')
new_sub_individual_list_info = new_sub_individual_list_info[0:-1]
return Response(new_sub_individual_list_info)
@api_view(['GET'])
def get_disease_sub(request, pk, pk2):
    # pk2 is plural (e.g. "Symptoms"); drop the trailing "s" to match the
    # ontology property names (hasSymptom, hasTreatment, ...).
    new_pk2 = pk2[:-1]
sub_individual_list = list(graph.query(
"""
SELECT ?subindividual
WHERE { ?disease <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#NamedDisease>;
FILTER(?disease = <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#""" + pk + """>).
?disease <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#has""" + new_pk2 + """> ?subindividual.
}"""))
    new_sub_individual_list = [str(x[0]) for x in sub_individual_list]
    fin_sub_list = []
    for y in new_sub_individual_list:
        # Keep only the local name after the '#' in each individual's IRI.
        itm = y[y.find('#'):][1:]
        fin_sub_list.append(itm)
    return Response(fin_sub_list)
@api_view(['GET'])
def get_diseases(request):
disease_list = list(graph.query("""
SELECT ?disease
WHERE {
?disease <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#NamedDisease> .
}"""))
new_disease_list = [str(x[0]) for x in disease_list]
fin_disease_list = []
for y in new_disease_list:
des = y[y.find('#'):][1:]
fin_disease_list.append(des)
return Response(fin_disease_list)
@api_view(['GET'])
def get_categorized_diseases(request, pk):
categorized_diseases = list(graph.query("""
SELECT ?s
WHERE { ?s <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease#hasInfection> <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#""" + pk + """> }
"""))
new_categorized_diseases = [str(x[0]) for x in categorized_diseases]
fin_des_list = []
for y in new_categorized_diseases:
des = y[y.find('#'):][1:]
fin_des_list.append(des)
return Response(fin_des_list)
@api_view(['GET'])
def get_link(request, pk):
seeAlsoLink = list(graph.query("""
SELECT ?link
WHERE { <http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl#""" + pk + """> <http://www.w3.org/2000/01/rdf-schema#seeAlso> ?link }
"""))
    new_link_list = [str(x[0]) for x in seeAlsoLink]
    return Response(new_link_list)
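
# Example response shapes (illustrative, using the Dandruff row from
# dogDisease.csv):
#
#     GET /diseaselink/Dandruff/
#     -> ["https://www.petmd.com/dog/conditions/skin/c_dg_canine_seborrhea"]
#
#     GET /diseasecategory/allergic/
#     -> ["Dandruff"]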
"""
ASGI config for djangoProject project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'djangoProject.settings')
application = get_asgi_application()
"""
Django settings for djangoProject project.
Generated by 'django-admin startproject' using Django 4.1.
For more information on this file, see
https://docs.djangoproject.com/en/4.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.1/ref/settings/
"""
import os
from pathlib import Path
import django_heroku
import dj_database_url
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-3ts!3sf#jntwaa-9r(^t%7ty=nc03p_eil3yha%#hinc30b&ww'
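# A common hardening step (illustrative; not done in this commit) is to read
# the key from the environment instead of committing it:
#     SECRET_KEY = os.environ.get('DJANGO_SECRET_KEY', SECRET_KEY)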
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = ['*']
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'api.apps.ApiConfig',
'rest_framework',
'corsheaders'
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
"corsheaders.middleware.CorsMiddleware",
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'djangoProject.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [BASE_DIR / 'templates']
,
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'djangoProject.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/4.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/4.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.1/howto/static-files/
STATIC_URL = 'static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles')
STATICFILES_DIRS = (os.path.join(BASE_DIR, 'static'),)
django_heroku.settings(locals())
# Default primary key field type
# https://docs.djangoproject.com/en/4.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
CORS_ALLOW_ALL_ORIGINS = True
from django.contrib import admin
from django.urls import path,include
urlpatterns = [
path('admin/', admin.site.urls),
path('', include('api.urls'))
]
"""
WSGI config for djangoProject project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'djangoProject.settings')
application = get_wsgi_application()
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
from owlready2 import *
def main():
"""Run administrative tasks."""
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'djangoProject.settings')
# onto = get_ontology("C:\\Users\\LENOVO\\Documents\\Research\\dogSkinDisease.owl").load()
#
# default_world.set_backend(filename="disease.sqlite3")
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
if __name__ == '__main__':
main()
<!DOCTYPE html>
<html lang="en">
{#<link rel="stylesheet" href="{{ url_for('static', filename='style.css')}}">#}
<head>
<meta charset="UTF-8">
<title>Chatbot</title>
</head>
<body>
<div class="container">
<div class="chatbox">
<div class="chatbox__support">
<div class="chatbox__header">
<div class="chatbox__image--header">
<img src="https://img.icons8.com/color/48/000000/circled-user-female-skin-type-5--v1.png" alt="image">
</div>
<div class="chatbox__content--header">
<h4 class="chatbox__heading--header">Chat support</h4>
<p class="chatbox__description--header">Hi. My name is Sam. How can I help you?</p>
</div>
</div>
<div class="chatbox__messages">
<div></div>
</div>
<div class="chatbox__footer">
<input type="text" placeholder="Write a message...">
<button class="chatbox__send--footer send__button">Send</button>
</div>
</div>
<div class="chatbox__button">
{# <button><img src="{{ url_for('static', filename='images/chatbox-icon.svg') }}" /></button>#}
</div>
</div>
</div>
</body>
</html>
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
</list>
</option>
</inspection_tool>
</profile>
</component>
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (buildOntology)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/buildOntology.iml" filepath="$PROJECT_DIR$/.idea/buildOntology.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<catalog prefer="public" xmlns="urn:oasis:names:tc:entity:xmlns:xml:catalog">
<group id="Folder Repository, directory=, recursive=true, Auto-Update=true, version=2" prefer="public" xml:base="">
<uri id="Automatically generated entry, Timestamp=1665432340207" name="http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl" uri="dogDisease.owl"/>
<uri id="Automatically generated entry, Timestamp=1665432340207" name="http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogSkinDisease" uri="dogSkinDisease.owl"/>
</group>
</catalog>
import re
import pandas as pd
if __name__ == '__main__':
df = pd.read_csv('petmd.csv')
link_df = pd.read_csv('link.csv')
df_col = df.columns
link_df_col = link_df.columns
data_dict = {
'Disease': [],
'DiseaseDescription': [],
'DiseaseCauseDescription': [],
'DiseaseDiagnoseDescription': [],
'DiseasePreventionDescription': [],
'DiseaseSymptomDescription': [],
'DiseaseTreatmentDescription': [],
'DiseaseInfection': [],
'DiseaseLink': []
}
disease = ''
diseases = []
for index, row in df.iterrows():
new_disease = row[df_col[0]]
if disease != new_disease:
disease = new_disease
diseases.append(disease)
for disease in diseases:
grouped_rows = df[df[df_col[0]] == disease]
link_rows = link_df[link_df[link_df_col[1]] == 'Disease Link']
disease_link_row = link_rows[link_rows[link_df_col[0]] == disease]
if disease_link_row.empty:
data_dict['DiseaseLink'].append('https://www.petmd.com')
else:
for row in disease_link_row.iterrows():
description = row[1][df_col[2]]
data_dict['DiseaseLink'].append(str(description).split(' ')[1][:-1])
break
        # Normalize the disease name into a bare CamelCase identifier:
        # remove spaces, apostrophes and the "inDogs" suffix, truncate at the
        # first ":", "(" or "/", then strip any remaining non-letters.
        regex = re.compile('[^a-zA-Z]')
        txt = str(disease).strip().replace(" ", "").replace("'", "").replace("inDogs", "")
        head, sep, tail = txt.partition(":")
        txt2 = head
        head, sep, tail = txt2.partition("(")
        txt3 = head
        head, sep, tail = txt3.partition("/")
        txt4 = regex.sub('', str(head))
        data_dict['Disease'].append(txt4)
temp = 0
for row in grouped_rows.iterrows():
regexp = re.compile(r'What|what')
topic = row[1][df_col[1]]
description = row[1][df_col[2]]
if regexp.search(str(topic)):
data_dict['DiseaseDescription'].append(str(description))
temp = 1
break
if temp == 0:
data_dict['DiseaseDescription'].append('')
temp = 0
for row in grouped_rows.iterrows():
regexp = re.compile(r'Cause|cause')
topic = row[1][df_col[1]]
description = row[1][df_col[2]]
if regexp.search(str(topic)):
data_dict['DiseaseCauseDescription'].append(str(description))
temp = 1
break
if temp == 0:
data_dict['DiseaseCauseDescription'].append('')
temp = 0
for row in grouped_rows.iterrows():
regexp = re.compile(r'Diagnose|diagnose')
topic = row[1][df_col[1]]
description = row[1][df_col[2]]
if regexp.search(str(topic)):
data_dict['DiseaseDiagnoseDescription'].append(str(description))
temp = 1
break
if temp == 0:
data_dict['DiseaseDiagnoseDescription'].append('')
temp = 0
for row in grouped_rows.iterrows():
regexp = re.compile(r'Prevention|prevention')
topic = row[1][df_col[1]]
description = row[1][df_col[2]]
if regexp.search(str(topic)):
data_dict['DiseasePreventionDescription'].append(str(description))
temp = 1
break
if temp == 0:
data_dict['DiseasePreventionDescription'].append('')
temp = 0
for row in grouped_rows.iterrows():
regexp = re.compile(r'Symptoms|symptoms')
topic = row[1][df_col[1]]
description = row[1][df_col[2]]
if regexp.search(str(topic)):
data_dict['DiseaseSymptomDescription'].append(str(description))
temp = 1
break
if temp == 0:
data_dict['DiseaseSymptomDescription'].append('')
temp = 0
for row in grouped_rows.iterrows():
regexp = re.compile(r'Treatment|treatment')
topic = row[1][df_col[1]]
description = row[1][df_col[2]]
if regexp.search(str(topic)):
data_dict['DiseaseTreatmentDescription'].append(str(description))
temp = 1
break
if temp == 0:
data_dict['DiseaseTreatmentDescription'].append('')
temp = 0
for row in grouped_rows.iterrows():
description = row[1][df_col[2]]
regexp = re.compile(r'Allerg|allerg')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Allergic')
temp = 1
break
regexp = re.compile(r'Bacter|bacter')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Bacterial')
temp = 1
break
regexp = re.compile(r'Flea|flea')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Fleas')
temp = 1
break
regexp = re.compile(r'Fung|fung')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Fungal')
temp = 1
break
regexp = re.compile(r'Virus|virus')
if regexp.search(str(description)):
data_dict['DiseaseInfection'].append('Viral')
temp = 1
break
if temp == 0:
data_dict['DiseaseInfection'].append('')
pd.DataFrame(data_dict).drop_duplicates().to_csv('dogDisease.csv', index=False)
pd.read_csv('test.csv').to_csv('dogDisease.csv', mode='a', index=False, header=False)
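
# The six topic-matching loops above repeat one pattern. An equivalent helper
# (illustrative refactor, not part of the original script) could replace them:
#
#     def first_match(rows, pattern, topic_col, desc_col):
#         """First description whose topic matches pattern, else ''."""
#         rx = re.compile(pattern, re.IGNORECASE)
#         for _, r in rows.iterrows():
#             if rx.search(str(r[topic_col])):
#                 return str(r[desc_col])
#         return ''
#
#     data_dict['DiseaseCauseDescription'].append(
#         first_match(grouped_rows, r'cause', df_col[1], df_col[2]))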
#Mon Sep 12 20:54:26 IST 2022
jdbc.url=
jdbc.driver=
jdbc.user=
jdbc.password=
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def print_hi(name):
# Use a breakpoint in the code line below to debug your script.
print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
print_hi('PyCharm')
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
from owlready2 import *
import csv, types
if __name__ == '__main__':
onto = get_ontology("dogSkinDisease.owl").load()
onto_individuals = get_ontology("http://www.semanticweb.org/dogdisease/ontologies/2022/4/dogDisease.owl")
onto_individuals.imported_ontologies.append(onto)
f = open("dogDisease.csv", encoding='utf-8')
reader = csv.reader(f)
next(reader)
    with onto_individuals:
        for row in reader:
            Disease, DiseaseDescription, DiseaseCauseDescription, DiseaseDiagnoseDescription, DiseasePreventionDescription, DiseaseSymptomDescription, DiseaseTreatmentDescription, DiseaseInfection, DiseaseLink = row
            # One NamedDisease individual per CSV row; topic-specific
            # subclasses and individuals are attached to it below.
            individual = onto.NamedDisease(str(Disease))
if DiseaseDescription:
individual.hasDescription.append(DiseaseDescription)
if DiseaseLink:
individual.seeAlso.append(DiseaseLink)
if DiseaseCauseDescription:
Sub_Class_name_cause = Disease + "Cause"
ClassCause = types.new_class(Sub_Class_name_cause, (onto.DiseaseCause,))
individualCause = ClassCause()
individualCause.hasCasueDescription.append(DiseaseCauseDescription)
individual.hasCause.append(individualCause)
if DiseaseDiagnoseDescription:
Sub_Class_name_diagnose = Disease + "Diagnose"
ClassDiagnose = types.new_class(Sub_Class_name_diagnose, (onto.DiseaseDiagnose,))
individualDiagnose = ClassDiagnose()
individualDiagnose.hasDiagnoseDescription.append(DiseaseDiagnoseDescription)
individual.hasDiagnose.append(individualDiagnose)
if DiseasePreventionDescription:
Sub_Class_name_prevention = Disease + "Prevention"
ClassPrevention = types.new_class(Sub_Class_name_prevention, (onto.DiseasePrevention,))
individualPrevention = ClassPrevention()
individualPrevention.hasPreventionDescription.append(DiseasePreventionDescription)
individual.hasPrevention.append(individualPrevention)
if DiseaseSymptomDescription:
Sub_Class_name_symptom = Disease + "Symptom"
ClassSymptom = types.new_class(Sub_Class_name_symptom, (onto.DiseaseSymptom,))
individualSymptom = ClassSymptom()
individualSymptom.hasSymptomDescription.append(DiseaseSymptomDescription)
individual.hasSymptom.append(individualSymptom)
if DiseaseTreatmentDescription:
Sub_Class_name_treatment = Disease + "Treatment"
ClassTreatment = types.new_class(Sub_Class_name_treatment, (onto.DiseaseTreatment,))
individualTreatment = ClassTreatment()
individualTreatment.hasTreatmentDescription.append(DiseaseTreatmentDescription)
individual.hasTreatment.append(individualTreatment)
if DiseaseInfection:
if DiseaseInfection == 'Allergic':
ClassAllergic = onto.Allergic
individualInfection = ClassAllergic('allergic')
individual.hasInfection.append(individualInfection)
elif DiseaseInfection == 'Bacterial':
ClassBacterial = onto.Bacterial
individualInfection = ClassBacterial('bacterial')
individual.hasInfection.append(individualInfection)
elif DiseaseInfection == 'Fleas':
ClassFleas = onto.Fleas
individualInfection = ClassFleas('fleas')
individual.hasInfection.append(individualInfection)
elif DiseaseInfection == 'Fungal':
ClassFungal = onto.Fungal
individualInfection = ClassFungal('fungal')
individual.hasInfection.append(individualInfection)
elif DiseaseInfection == 'Viral':
ClassViral = onto.Viral
individualInfection = ClassViral('viral')
individual.hasInfection.append(individualInfection)
onto_individuals.save("dogDisease.owl")
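
# Quick sanity check on the generated ontology (illustrative; run separately):
#
#     from owlready2 import get_ontology
#     check = get_ontology("dogDisease.owl").load()
#     for ind in check.individuals():
#         print(ind.name, getattr(ind, "hasDescription", []))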
Disease,DiseaseDescription,DiseaseCauseDescription,DiseaseDiagnoseDescription,DiseasePreventionDescription,DiseaseSymptomDescription,DiseaseTreatmentDescription,DiseaseInfection,DiseaseLink
Dandruff,"If you’ve noticed white flakes in your dog’s fur, you might be wondering if they have dandruff or whether dogs even get dandruff. Yes, they can. Dandruff, or seborrheic dermatitis, is common in dogs and humans alike. Dandruff is not typically a sign of a serious condition, but you can talk with a veterinarian to find out what may be causing it. Make an appointment sooner rather than later if you see symptoms like extreme itchiness or a change in weight or behavior. Not all dandruff in dogs looks like white flakes. It can be either dry or oily, or it may not even be true dandruff. The underlying skin may or may not be red or patchy from hair loss. Here are the most common types: Seborrhea sicca (dry seborrhea): This dry dandruff may appear as white flakes with crusty skin. Seborrhea oleosa (oily seborrhea): Your dog’s skin may have an oily feel and give off an odor. Walking dandruff: If the dandruff seems like it’s moving, this is called Cheyletiella and is actually a type of mite.","Primary seborrhea is a congenital, genetic disease that typically starts at a young age and gets worse as your dog gets older. West Highland White Terriers, Basset Hounds, American Cocker Spaniels, and English Springer Spaniels are most commonly affected.","A skin scraping to test for mites and lice.An impression cytology (collection) of skin and ear debris to test for a yeast or bacterial infection that looks like seborrhea, such as Malassezia yeast.A blood chemistry panel to screen for diabetes or Cushing’s disease (your vet will need further tests to confirm the diagnosis before starting treatment).A blood test for thyroid hormone levels to determine whether your dog has hypothyroidism.A biopsy to look for autoimmune disease or cancer.","Keep your dog properly fed to prevent dry skin.When bathing your dog, use dog-formulated shampoo to prevent dry skin.Groom your dog regularly; some problems are caused by matted hair that provides breeding grounds for a variety of skin diseases. Regular grooming also helps keep you aware of any initial problems.Keep your dog flea and parasite free.Check your dog regularly for foxtails, burrs, and other sharp objects they may pick up when outside.You should also look for seeds, burrs and sharp plants or objects they may have picked up on a walk.suggest bathing your dog more frequently to prevent dandruff. using a specially formulated dog dandruff shampoo and making sure you dry your pet thoroughly after their bath.omega-3 fatty acids and vitamin E that promote a healthy coat and immune system",Itching that ranges from mild to severe.,"Needs frequent baths with anti-seborrheic shampoos, typically every 2 or 3 days to start with. These shampoos typically contain coal tar and salicylic acid.Frequent bathing is continued for 2-3 weeks or longer, until the skin improves. The goal of bathing is to remove excess keratin. Depending on how your dog responds to treatment, bathing frequency may decrease to every 1 to 2 weeks, or it may stay at every 2 to 3 days.clean your dog’s ears with a medicated ear cleaner every 2 to 3 days. If there is an infection in the ears, your veterinarian will prescribe an ear medication as well. dog may also be started on prednisone to decrease inflammation and debris buildup. Regular rechecks with your veterinarian, typically every one to three weeks, are important to monitor how your dog is responding to treatment.",Allergic,https://www.petmd.com/dog/conditions/skin/c_dg_canine_seborrhea
Dandruff,,"Skin allergies to fleas, food, and the environment.Hypothyroidism, caused by an underactive thyroid gland.Cushing’s disease, caused by an overactive adrenal gland.Diabetes mellitus.Mites and lice.Autoimmune diseases like pemphigus foliaceus, sebaceous adenitis, and lupus erythematosus.A type of cancer called cutaneous epitheliotropic lymphoma.Vitamin deficiencies like zinc-responsive dermatosis and vitamin A-responsive dermatosis.Dogs with lots of skin folds, like Basset Hounds, usually experience more affected skin in those folds.",,,"Very dry, dull coat.Dandruff.Greasy, oily skin that smells bad.Crusted, plaque-like (rough and scaly) skin lesions.Large amount of earwax and ear debris.",,,
Ringworm,,,,,"Itchiness, scratching, or excessive grooming",,,
Ringworm,,,,,"Circular areas of hair loss, often with a red and crusty edge.Broken hair and a poor hair coat.Dry, scaly skin or areas of excessive dandruff. Inflamed areas of skin.Darkened patches of skin. Inflamed nail beds or darkened or dry nails.Dry, brittle, or misshapen nails. circular itchy rash that typically appears on the skin.",,,
YeastInfection,"Yeast are spore-producing fungi that are always present on a dog’s skin, usually in low numbers, as part of the normal flora. A yeast infection happens when there’s an excessive amount of yeast in a certain area. Yeast infections in dogs are quite common and can occur anywhere on the skin, including the ears. Generally, yeast infections are caused by another issue. Anything that diminishes the normal defenses in the skin can make yeast infections more likely. Itchy, irritated skin with a distinct odor can be an indication of a yeast infection, a common skin concern for dogs. A dog will typically develop a yeast infection on the skin or in the ears. Regardless of location, a yeast infection can cause extreme discomfort and can be an indication of a more serious issue.",Yeast infections in dogs are usually secondary problems. This means that there is some other issue that is weakening the skin’s defense mechanisms to allow the yeast to grow in higher numbers than normal.It is very common to see yeast infections in a dog’s ears or on their skin if they have food allergies or environmental allergies. Other underlying issues that may cause yeast infections in dogs include hormonal problems or other diseases that suppress the immune system.,"Yeast infections in a dog’s ears generally cause redness, a brown discharge, head shaking or rubbing, odor, and itching.","Prevention of yeast infections in dogs must include addressing the underlying cause to reduce the likelihood that the infection will reoccur.Routine bathing with an antifungal shampoo may be beneficial. However, for shampoo treatment to be effective, the lather must sit on a dog’s skin for a minimum of 10 minutes before rinsing.Dogs with skin folds may need to have maintenance treatment to keep these areas from becoming too moist, as yeast thrive in moist, dark places such as skin folds and ears.Dogs that have had allergy testing and are determined to be allergic to yeast can be desensitized by including yeast as an ingredient in immunotherapy (allergy vaccine). If you suspect that your dog has a yeast infection, consult your regular veterinarian for a diagnosis and treatment plan that is appropriate for your pet.","Yeast infections on a dog’s mouth or face can cause extreme itching or face rubbing.Dogs with yeast infections on the paws usually lick their paws more than normal. Yeast infections in a dog’s ears can be very itchy, causing dogs to scratch their ears or rub their head excessively.","Your veterinarian may perform cytology (taking a swab of the discharge and staining it to look at it under the microscope) to diagnose a yeast infection in a dog’s ears.Prescription treatment may include antifungal drops or ointment, an ear cleaner, and in severe or difficult-to-treat cases, an oral antifungal medication.",Fungal,https://www.petmd.com
YeastInfection,,,,,"The affected skin may be red, irritated, itchy, greasy, or flaky, and there may be hair loss.If the infection is chronic, the skin may thicken and become darker in color. A dog with yeast infections on their paws can have red, irritated, and itchy paws.The underside of the paws, between the pads, is affected most often, but yeast can occur anywhere on the paws. Sometimes a brown discharge can be seen in the nail beds.Dog ear yeast infections are quite common, and the ears often smell sweet or musty.Usually, you will see redness, which may extend onto the flap of the ear, and the discharge is generally brown. The ear may appear to be greasy, and the hair may be matted.",,,
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (pythonProject) (2)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/pythonProject.iml" filepath="$PROJECT_DIR$/.idea/pythonProject.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BookCrawlerItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class BookCrawlerSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class BookCrawlerDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class BookCrawlerPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for book_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'book_crawler'
SPIDER_MODULES = ['book_crawler.spiders']
NEWSPIDER_MODULE = 'book_crawler.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'book_crawler (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'book_crawler.middlewares.BookCrawlerSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'book_crawler.middlewares.BookCrawlerDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'book_crawler.pipelines.BookCrawlerPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
class BooksSpider(Spider):
name = 'books'
allowed_domains = ['books.toscrape.com']
def start_requests(self):
self.driver = webdriver.Chrome('C:/Users/Asus/Documents/chromedriver')
self.driver.get('http://books.toscrape.com')
sel = Selector(text=self.driver.page_source)
books = sel.xpath('//h3/a/@href').extract()
for book in books:
url = 'http://books.toscrape.com/' + book
yield Request(url, callback=self.parse_book)
while True:
try:
next_page = self.driver.find_element(by=By.XPATH, value='//a[text()="next"]')
sleep(3)
self.logger.info('Sleeping for 3 seconds')
next_page.click()
sel = Selector(text=self.driver.page_source)
books = sel.xpath('//h3/a/@href').extract()
for book in books:
url = 'http://books.toscrape.com/catalogue/' + book
yield Request(url, callback=self.parse_book)
except NoSuchElementException:
self.logger.info('No more pages to load.')
self.driver.quit()
break
def parse_book(self, response):
pass
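
# Run with (assumes the chromedriver path above is valid):
#     scrapy crawl books -o books.json
# parse_book is still a stub; per-book field extraction would go there.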
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = book_crawler.settings
[deploy]
#url = http://localhost:6800/
project = book_crawler
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FirstscrapyItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class FirstscrapySpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class FirstscrapyDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class FirstscrapyPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for firstScrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'firstScrapy'
SPIDER_MODULES = ['firstScrapy.spiders']
NEWSPIDER_MODULE = 'firstScrapy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'firstScrapy (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'firstScrapy.middlewares.FirstscrapySpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'firstScrapy.middlewares.FirstscrapyDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'firstScrapy.pipelines.FirstscrapyPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
class Quotes2Spider(scrapy.Spider):
name = 'quotes2'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com/']
def parse(self, response):
# h1_tag = response.xpath('//h1/a/text()').extract_first()
# tag = response.xpath('//*[@class="tag-item"]/a/text()').extract()
# yield {'H1 Tag': h1_tag, 'Tags': tag}
quotes = response.xpath('//*[@class="quote"]')
for quote in quotes:
text = quote.xpath('.//*[@class="text"]/text()').extract_first()
author = quote.xpath('.//*[@itemprop="author"]/text()').extract_first()
tags = quote.xpath('.//*[@itemprop="keywords"]/@content').extract_first()
yield {'Text': text,
'Author': author,
'Tags': tags}
        next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
        # Guard against the last page: urljoin(None) would re-yield the
        # current URL and rely on the dupefilter to stop the crawl.
        if next_page_url is not None:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)
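
# Example run from the project root:
#     scrapy crawl quotes2 -o quotes.json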
import scrapy
class QuotesSpider(scrapy.Spider):
name = 'quotes'
def start_requests(self):
        urls = ['http://quotes.toscrape.com/page/1/',
                'http://quotes.toscrape.com/page/2/']
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response, **kwargs):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
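
# Running `scrapy crawl quotes` writes quotes-1.html and quotes-2.html into
# the working directory.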
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = firstScrapy.settings
[deploy]
#url = http://localhost:6800/
project = firstScrapy
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FirstvetItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class FirstvetSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class FirstvetDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class FirstvetPipeline:
def process_item(self, item, spider):
return item