import loadOntology
import csv
from sklearn.feature_extraction.text import CountVectorizer
import py_stringmatching
from py_stringmatching import Jaro,JaroWinkler
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import fasttext
from sklearn.cluster import DBSCAN,SpectralClustering
import rdflib
from rdflib import RDF,RDFS,OWL

# Ontology files handled by this script. Other code addresses an ontology by
# its position in this list, so the order matters.
ontologiesname = [
    'Crosscult-ontology-v3.8-crm-rdf.owl',
    'cultural-ON.owl',
    'cidoc crm.owl',
    'edm-v524-130522-crm.owl',
    'arco_2019-06-01-rdf.owl',
    'drammar.owl',
    'GettyVocabularyProgrameOntology.rdf',
    'hico2020-rdf.owl',
    'oad.owl',
    'frbr-core-20050810.rdf',
    'bibo.rdf',
    'bibframe.rdf',
    'RiC-O_v0.2.rdf',
    'vir-rdf.owl',
    'biro1.1.owl',
]

# Directory holding the per-ontology class-list CSV files
# (file name = ontology file name + ".csv"). Must end with a path separator
# because file names are appended by plain string concatenation.
csvfilepath = 'D:\\待写论文\\文化遗产领域本体评估与剖析\\实验结果2\\本体类列表\\'

# Directory holding the ontology files themselves (also separator-terminated).
ontologiesFilepath = "D:\\待写论文\\文化遗产领域本体评估与剖析\\ontologies\\本体文件与说明\\"

def writeCSV(index):
    """Export the classes of one ontology to a two-column CSV file.

    The ontology is ``ontologiesname[index]``; the output file is
    ``csvfilepath + <ontology file name> + ".csv"`` and contains one row per
    class: the class IRI followed by its label.

    Args:
        index: Position of the ontology in ``ontologiesname``.

    Raises:
        IndexError: If ``index`` is outside ``ontologiesname``.
    """
    ontology_name = ontologiesname[index]

    # Load the ontology BEFORE opening the output file: opening in 'w' mode
    # truncates immediately, so a failure while loading would otherwise leave
    # an empty CSV behind (or wipe a previously generated one).
    # NOTE(review): getAllClasses is expected to return two parallel
    # sequences (IRIs, labels) -- confirm against loadOntology.
    classesIRI, classesLable = loadOntology.getAllClasses(ontologiesFilepath, ontology_name)

    # newline='' lets the csv module control line endings itself.
    with open(csvfilepath + ontology_name + ".csv", 'w', newline='') as f:
        csv.writer(f).writerows(zip(classesIRI, classesLable))


# Get parent classes: given a class URI, return the URIs of its parent classes.
def getFatherclasses(graph, URI):
    """Return the URIs of the directly asserted, named superclasses of a class.

    Args:
        graph: rdflib graph to search.
        URI: IRI of the class, as a string.

    Returns:
        list[str]: The object of every ``rdfs:subClassOf`` triple whose
        subject is ``URI``, converted to ``str``. Blank-node objects
        (presumably anonymous class expressions) are skipped. Only direct
        assertions are returned; the hierarchy is not walked transitively.
    """
    subject = rdflib.term.URIRef(URI)
    return [
        str(parent)
        for _, _, parent in graph.triples((subject, RDFS.subClassOf, None))
        # isinstance (rather than an exact type comparison) is the idiomatic
        # check and also covers any BNode subclass.
        if not isinstance(parent, rdflib.term.BNode)
    ]

# Get the owl:equivalentClass partners of a class. The inverse direction must be considered too.
def getEquivlentclass(graph,URI):
    """Return the URIs of classes declared equivalent to the given class.

    ``owl:equivalentClass`` may be asserted in either direction, so both
    ``<URI> owl:equivalentClass ?e`` and ``?e owl:equivalentClass <URI>`` are
    looked up. A partner asserted in both directions appears twice in the
    result.

    Args:
        graph: rdflib graph to search.
        URI: IRI of the class, as a string.

    Returns:
        list[str]: IRIs (as ``str``) of the named equivalent classes. Blank
        nodes are excluded.
    """
    subject = rdflib.term.URIRef(URI)
    equiclass = []

    # Direct triple-pattern lookups replace the former string-built SPARQL
    # query: nothing has to be parsed per call and the URI is never spliced
    # into query text.
    forward = (o for _, _, o in graph.triples((subject, OWL.equivalentClass, None)))
    backward = (s for s, _, _ in graph.triples((None, OWL.equivalentClass, subject)))

    for candidates in (forward, backward):
        for node in candidates:
            # The previous check compared the node *instance* with the BNode
            # *class* (always unequal), so blank nodes were never filtered.
            if not isinstance(node, rdflib.term.BNode):
                equiclass.append(str(node))
    return equiclass




############
# Read the class CSV files and build the similarity matrix between concepts (a 666-dimensional sparse matrix).
def SimilarityMatrix():
    """Compute and save the class-to-class similarity matrix across ontologies.

    Inputs (all located through module-level settings):
        * one CSV per entry of ``ontologiesname`` under ``csvfilepath`` with
          rows ``IRI, label``;
        * the ontology files under ``ontologiesFilepath``;
        * the fastText model file ``cc.en.100.bin`` in the working directory.

    All classes of all ontologies are laid out consecutively, in
    ``ontologiesname`` order, along both axes of a square matrix. Only pairs
    (source ontology x, target ontology y) with ``x < y`` are compared, so
    the matrix is block upper-triangular: same-ontology blocks and the lower
    triangle stay zero.

    Score of a (source class, target class) pair, first matching rule wins:
        1. identical IRI                                   -> 1.0
        2. identical label                                 -> 1.0
        3. target IRI is a direct parent of the source     -> 1.0
        4. owl:equivalentClass declared in either ontology -> 1.0
        5. cosine similarity of the fastText sentence vectors of the two
           labels, rounded to 3 decimals, kept only if >= 0.75
        otherwise 0.

    Outputs:
        * ``resultSIM.csv``   -- the matrix, ``%.3f`` formatted;
        * ``resultTitle.csv`` -- one label per row, in matrix order.
    """
    # One list of [IRI, label] rows per ontology, in ontologiesname order.
    # newline='' is the csv-module recommended way to open files for reading.
    allClassesList = []
    for name in ontologiesname:
        with open(csvfilepath + name + ".csv", 'r', newline='') as f:
            allClassesList.append(list(csv.reader(f)))

    classIRIs = [[row[0] for row in rows] for rows in allClassesList]
    classLabels = [[row[1] for row in rows] for rows in allClassesList]

    # offsets[k] is the matrix index of the first class of ontology k;
    # offsets[-1] is the total number of classes.
    resultTitle = []
    offsets = [0]
    for labels in classLabels:
        resultTitle.extend(labels)
        offsets.append(offsets[-1] + len(labels))
        print(len(labels))
    print("############")
    countNum = offsets[-1]

    resultMatrix = np.zeros((countNum, countNum))
    ft = fasttext.load_model('cc.en.100.bin')

    # Parse every ontology exactly once and precompute, per class, its direct
    # parents and its declared equivalents. These lookups do not depend on
    # the pair being compared, so repeating them inside the pair loops (and
    # re-parsing the target ontology for every source ontology) was wasted
    # work. Sets give O(1) membership tests.
    fatherSets = []
    equivalentSets = []
    for name, iris in zip(ontologiesname, classIRIs):
        onto_url = ontologiesFilepath + name
        ontologygraph = rdflib.Graph()
        ontologygraph.parse(onto_url, format=rdflib.util.guess_format(onto_url))
        fatherSets.append([set(getFatherclasses(ontologygraph, iri)) for iri in iris])
        equivalentSets.append([set(getEquivlentclass(ontologygraph, iri)) for iri in iris])

    # Each label is embedded at most once, and only if a pair actually
    # reaches the semantic-similarity rule.
    vectorCache = {}

    def sentenceVector(label):
        vector = vectorCache.get(label)
        if vector is None:
            vector = ft.get_sentence_vector(label)
            vectorCache[label] = vector
        return vector

    ontologyCount = len(ontologiesname)
    for x in range(ontologyCount):
        rowBase = offsets[x]
        for y in range(x + 1, ontologyCount):
            colBase = offsets[y]
            for i, sourceIRI in enumerate(classIRIs[x]):
                sourcelabel = classLabels[x][i]
                fatherclass = fatherSets[x][i]
                equivlentclasses = equivalentSets[x][i]
                for j, targetIRI in enumerate(classIRIs[y]):
                    targetlabel = classLabels[y][j]

                    if sourceIRI == targetIRI:
                        # Same IRI: the class is reused directly.
                        print("复用")
                        score = 1.0
                    elif sourcelabel == targetlabel:
                        score = 1.0
                    else:
                        # Equivalence may be declared on either side, so both
                        # ontologies are checked.
                        declaredEquivalent = False
                        if targetIRI in equivlentclasses:
                            declaredEquivalent = True
                            print('s相等关系')
                        if sourceIRI in equivalentSets[y][j]:
                            declaredEquivalent = True
                            print('t相等关系')

                        if targetIRI in fatherclass:
                            # The source's parent is the target class itself,
                            # i.e. the target is reused as a superclass.
                            print("父类复用")
                            score = 1.0
                        elif declaredEquivalent:
                            # Must not fall through to the semantic score:
                            # previously a declared equivalence (1.0) could
                            # be overwritten by a lower cosine similarity.
                            score = 1.0
                        else:
                            sklearnSM = cosine_similarity(
                                [sentenceVector(sourcelabel), sentenceVector(targetlabel)])
                            SM = round(sklearnSM[0][1], 3)
                            # Keep only strong semantic matches.
                            score = SM if SM >= 0.75 else 0.0

                    if score:
                        resultMatrix[rowBase + i, colBase + j] = score

    # The matrix is deliberately left upper-triangular (not symmetrised with
    # its transpose) for the Gephi visualisation.
    np.savetxt('resultSIM.csv', resultMatrix, delimiter=',',fmt='%.3f')
    # Store the labels alongside, one per row, in matrix order.
    with open("resultTitle.csv", 'w', newline='') as f:
        csv.writer(f).writerows(zip(resultTitle))





if __name__ == "__main__":
    # Step a: parse ontology files and write their class lists to CSV.
    # NOTE(review): only list positions 7..13 are exported here, while
    # SimilarityMatrix() reads a CSV for every ontology -- presumably the
    # remaining CSVs already exist from an earlier run; confirm.
    for ontology_index in range(7, 14):
        writeCSV(ontology_index)

    # Step b: compute the similarity between all classes of all ontologies.
    SimilarityMatrix()

