llama_index [Question]: rel_props is not being saved to persistent storage

hsgswve4 · asked 23 days ago in Other

Question validation

  • I have already searched the documentation and Discord for an answer.

Question

Hi, I'd like to know why rel_props is not being saved to the persistent storage of my graph index. This may be the cause of my problem: after I load the knowledge graph from persistent storage and query the index, nothing is returned. Below is the script that creates and persists the storage, followed by a sample of the index_store.json from my persist directory storage_graph.
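For reference, the load-and-query step described above looks roughly like this (a sketch; the persist_dir and index_id match the creation script below, and the query text is illustrative):

from llama_index.core import StorageContext, load_index_from_storage
from llama_index.graph_stores.nebula import NebulaGraphStore

# Re-attach the same NebulaGraphStore (same space/config as in the script below),
# then reload the persisted index by the index_id it was saved under.
graph_store = NebulaGraphStore(
    space_name="test9",
    edge_types=["relationship"],
    rel_prop_names=["relationship"],
    tags=["entity"],
)
storage_context = StorageContext.from_defaults(
    graph_store=graph_store, persist_dir="./storage_graph_test10"
)
kg_index = load_index_from_storage(storage_context, index_id="kg_index")
response = kg_index.as_query_engine().query("What is MECH 3202?")  # illustrative query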
Script:

from flask import Flask, request, jsonify
import os
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    Settings,
    PromptTemplate,
    KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.schema import TextNode
import base64, logging, json

logging.basicConfig(level=logging.DEBUG)  # only the first basicConfig call takes effect

Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512

os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

space_name = "test9"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

def encode_string(s):
    return base64.urlsafe_b64encode(s.encode()).decode()

def decode_string(s):
    return base64.urlsafe_b64decode(s.encode()).decode()

def sanitize_and_encode(data):
    sanitized_data = {}
    for key, value in data.items():
        if isinstance(value, str):
            sanitized_data[key] = encode_string(value)
        else:
            sanitized_data[key] = value
    return sanitized_data

def decode_metadata(metadata):
    decoded_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, str):
            decoded_metadata[key] = decode_string(value)
        else:
            decoded_metadata[key] = value
    return decoded_metadata

def load_json_nodes(json_directory):
    nodes = []
    for filename in os.listdir(json_directory):
        if filename.endswith('.json'):
            with open(os.path.join(json_directory, filename), 'r') as file:
                data = json.load(file)
                for node_data in data:
                    sanitized_metadata = sanitize_and_encode(node_data['metadata'])
                    node = TextNode(
                        text=encode_string(node_data['text']),
                        id_=node_data['id_'],
                        embedding=node_data['embedding'],
                        metadata=sanitized_metadata
                    )
                    nodes.append(node)
                    logging.debug(f"Loaded node ID: {node.id_}, text: {node_data['text']}, metadata: {node_data['metadata']}")
                    
    return nodes

def create_index():
    graph_store = NebulaGraphStore(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags
    )

    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    
    json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
    documents = [
        Document(
            text=decode_string(node.text),
            id_=node.id_,
            metadata=decode_metadata(node.metadata),
            embedding=node.embedding
        ) for node in json_nodes
    ]
    
    kg_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        max_knowledge_sequence=15,
        include_embeddings=True
    )
    
    # Set the index_id for KnowledgeGraphIndex
    kg_index.set_index_id("kg_index")
    
    kg_index.storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"KG Index created with {len(documents)} documents")

    # Create VectorStoreIndex
    vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    
    # Set the index_id for VectorStoreIndex
    vector_index.set_index_id("vector_index")
    
    # Persist the storage context
    storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"Vector Index created with {len(documents)} documents")
    return kg_index, vector_index, storage_context

print("Creating Index...")
kg_index, vector_index, storage_context = create_index()
print("Index Created...")

The index_store.json file:

{"index_store/data": {"kg_index": {"__type__": "kg", "__data__": "{\"index_id\": \"kg_index\", \"summary\": null, \"table\": {\"Mech 3202\": [\"48ae3288-8d75-429f-864d-0ba2054433d9\"], \"Fluid dynamics\": [\"48ae3288-8d75-429f-864d-0ba2054433d9\"], \"York university\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\", \"48ae3288-8d75-429f-864d-0ba2054433d9\"], \"Mech 3202 fluid dynamics\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Course_name\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"University\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Text\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Content_type\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Pse 317\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"], \"Tutorial location\": [\"f202ac7e-5343-4def-a94e-976dac45b27d\"]}, \"rel_map\": {}, \"embedding_dict\": {\"('Mech 3202', 'Is', 'Fluid dynamics')\": [0. ...
lmyy7pcs · #1

rel_props is not being saved to persistent storage because the refresh_schema method is not being called or executed correctly. This method is responsible for updating the structured_schema dictionary, which includes rel_props.
To fix this, make sure refresh_schema is called after relationships or their properties change. You can do this by explicitly calling self.refresh_schema() after modifying relationships.
Here is the relevant code that calls refresh_schema:

def __init__(
        self,
        username: str,
        password: str,
        url: str,
        database: Optional[str] = "neo4j",
        refresh_schema: bool = True,
        sanitize_query_output: bool = True,
        enhanced_schema: bool = False,
        **neo4j_kwargs: Any,
    ) -> None:
        self.sanitize_query_output = sanitize_query_output
        self.enhanced_schema = enhanced_schema
        self._driver = neo4j.GraphDatabase.driver(
            url, auth=(username, password), **neo4j_kwargs
        )
        self._async_driver = neo4j.AsyncGraphDatabase.driver(
            url,
            auth=(username, password),
            **neo4j_kwargs,
        )
        self._database = database
        self.structured_schema = {}
        if refresh_schema:
            self.refresh_schema()

Make sure refresh_schema is set to True when initializing the class, or call self.refresh_schema() manually after any update to relationships.
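A minimal sketch of both options, assuming the Neo4jPropertyGraphStore whose __init__ is shown above (the connection values are placeholders):

from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

graph_store = Neo4jPropertyGraphStore(
    username="neo4j",              # placeholder credentials
    password="password",
    url="bolt://localhost:7687",
    refresh_schema=True,           # rebuilds structured_schema (incl. rel_props) on init
)

# ... after inserting or updating relationships:
graph_store.refresh_schema()       # re-reads rel_props into structured_schema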
Also, make sure the persist method is implemented correctly so that the graph is saved to a file. Here is an example of how persist can be implemented:

def persist(
    self, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> None:
    """Persist the graph store to a file."""
    if fs is None:
        fs = fsspec.filesystem("file")
    with fs.open(persist_path, "w") as f:
        f.write(self.graph.json())

This method ensures that the current state of the graph, including rel_props, is saved to the specified file.
In the PropertyGraphStore class, persist is defined as an abstract method, which means it must be implemented in any subclass. Make sure your subclass implements this method so that rel_props are persisted:

class MyGraphStore(PropertyGraphStore):
    def persist(
        self, persist_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
    ) -> None:
        """Persist the graph store to a file."""
        if fs is None:
            fs = fsspec.filesystem("file")
        with fs.open(persist_path, "w") as f:
            f.write(self.graph.json())

By making sure refresh_schema is called appropriately and persist is implemented correctly, you can resolve the issue of rel_props not being saved to persistent storage 1(https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/graph_stores/llama-index-graph-stores-neo4j/llama_index/graph_stores/neo4j/neo4j_property_graph.py) 2(https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/graph_stores/simple_labelled.py) 3(https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/graph_stores/types.py).
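For context, a sketch of how persistence is triggered from a StorageContext; note that a database-backed store like NebulaGraphStore keeps its triplets (and rel_props) in NebulaGraph itself rather than in the persist directory:

# StorageContext.persist fans out to its component stores; a file-backed
# store (e.g. the default SimpleGraphStore) serializes to graph_store.json,
# while NebulaGraph-backed data stays in the database.
storage_context.persist(persist_dir="./storage_graph_test10")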

  • To continue the conversation with Dosu, mention @dosu.
u1ehiz5o · #2

So is it possible to save my rel_props into the persistent graph store's rel_map, or do I have to connect to my NebulaGraph space to retrieve those rel_props?

ibrsph3r · #3

Yes, it is possible to save your rel_props into the persistent graph store's rel_map. The NebulaPropertyGraphStore class provides methods for inserting triplets and managing relationships, which means you can store your rel_props directly in the graph store.
Here is the relevant method from the NebulaPropertyGraphStore class:

def upsert_triplet(self, subj: str, rel: str, obj: str) -> None:
    """Add triplet."""
    # Note, to enable leveraging existing knowledge graph,
    # the (triplet -- property graph) mapping
    #   makes (n:1) edge_type.prop_name --> triplet.rel
    # thus we have to assume rel to be the first edge_type.prop_name
    # here in upsert_triplet().
    # This applies to the type of entity(tags) with subject and object, too,
    # thus we have to assume subj to be the first entity.tag_name

This method lets you add a triplet (subject, relation, object) to the graph, which means you can save your rel_props directly to the persistent graph store without having to retrieve them from the NebulaGraph space 1(https://github.com/run-llama/llama_index/blob/main/llama-index-legacy/llama_index/legacy/graph_stores/nebulagraph.py) 2(https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/graph_stores/llama-index-graph-stores-nebula/llama_index/graph_stores/nebula/nebula_graph_store.py) 3(https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/graph_stores/llama-index-graph-stores-nebula/llama_index/graph_stores/nebula/nebula_property_graph.py).
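A one-line usage sketch of the method above, with illustrative values taken from the index_store.json excerpt earlier in the thread:

# Insert a (subject, relation, object) triplet directly into the graph store.
graph_store.upsert_triplet("Mech 3202", "Is", "Fluid dynamics")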

  • To continue the conversation with Dosu, mention @dosu.
anauzrmj · #4

How can I include this in my previous script that creates both the vector and graph indices and saves them to persistent storage?
@dosu

mec1mxoz · #5

So how can I update my script to ensure the rel_map is populated within index_store.json in my persistent storage?
@dosu

nukf8bse · #6

To make sure rel_map is correctly populated in the index_store.json of your persistent storage, you need to verify that the get_rel_map method correctly fetches and structures the relationship data. Here is how to update the script to include this:

  1. Ensure get_rel_map is implemented correctly: verify that the method fetches and structures the relationship data as expected.

  2. Persist rel_map to index_store.json: serialize the rel_map dictionary to JSON and write it to the file.

Here is the updated version of the script with these changes:


from flask import Flask, request, jsonify
import os
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    Settings,
    PromptTemplate,
    KnowledgeGraphIndex
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.schema import QueryBundle
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.schema import TextNode
import base64, logging, json

logging.basicConfig(level=logging.DEBUG)  # only the first basicConfig call takes effect

Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
Settings.chunk_size = 512

os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

space_name = "test9"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

def encode_string(s):
    return base64.urlsafe_b64encode(s.encode()).decode()

def decode_string(s):
    return base64.urlsafe_b64decode(s.encode()).decode()

def sanitize_and_encode(data):
    sanitized_data = {}
    for key, value in data.items():
        if isinstance(value, str):
            sanitized_data[key] = encode_string(value)
        else:
            sanitized_data[key] = value
    return sanitized_data

def decode_metadata(metadata):
    decoded_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, str):
            decoded_metadata[key] = decode_string(value)
        else:
            decoded_metadata[key] = value
    return decoded_metadata

def load_json_nodes(json_directory):
    nodes = []
    for filename in os.listdir(json_directory):
        if filename.endswith('.json'):
            with open(os.path.join(json_directory, filename), 'r') as file:
                data = json.load(file)
                for node_data in data:
                    sanitized_metadata = sanitize_and_encode(node_data['metadata'])
                    node = TextNode(
                        text=encode_string(node_data['text']),
                        id_=node_data['id_'],
                        embedding=node_data['embedding'],
                        metadata=sanitized_metadata
                    )
                    nodes.append(node)
                    logging.debug(f"Loaded node ID: {node.id_}, text: {node_data['text']}, metadata: {node_data['metadata']}")

    return nodes

def save_rel_map_to_json(rel_map, file_path):
    with open(file_path, 'w') as json_file:
        json.dump(rel_map, json_file, indent=4)

def create_index():
    graph_store = NebulaGraphStore(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags
    )

    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    json_nodes = load_json_nodes("JSON_nodes_999_large_syll")
    documents = [
        Document(
            text=decode_string(node.text),
            id_=node.id_,
            metadata=decode_metadata(node.metadata),
            embedding=node.embedding
        ) for node in json_nodes
    ]

    kg_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        max_knowledge_sequence=15,
        include_embeddings=True
    )

    # Set the index_id for KnowledgeGraphIndex
    kg_index.set_index_id("kg_index")

    kg_index.storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"KG Index created with {len(documents)} documents")

    # Create VectorStoreIndex
    vector_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    # Set the index_id for VectorStoreIndex
    vector_index.set_index_id("vector_index")

    # Persist the storage context
    storage_context.persist(persist_dir='./storage_graph_test10')
    logging.debug(f"Vector Index created with {len(documents)} documents")

    # Get and save the rel_map.
    # Caution: writing to index_store.json overwrites the index store that
    # persist() wrote above; use a separate file if you need to keep both.
    rel_map = graph_store.get_rel_map()
    save_rel_map_to_json(rel_map, './storage_graph_test10/index_store.json')

    return kg_index, vector_index, storage_context

print("Creating Index...")
kg_index, vector_index, storage_context = create_index()
print("Index Created...")

This script ensures that rel_map is correctly populated and saved to index_store.json in your persistent storage.
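To verify the write, you can load the file back and inspect a few entries (a quick sketch):

import json

# Read back the rel_map that save_rel_map_to_json wrote and print a sample.
with open('./storage_graph_test10/index_store.json') as f:
    rel_map = json.load(f)
print(list(rel_map.items())[:3])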

  • To continue the conversation with Dosu, mention @dosu.
