from llama_index.core import SimpleDirectoryReader
# Load data
documents = SimpleDirectoryReader(
    input_files=["./data/paul_graham_essay.txt"]
).load_data()
#documents
from llama_index.core.node_parser import SentenceWindowNodeParser
# create the sentence window node parser w/ default settings
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
# Extract nodes from documents
nodes = node_parser.get_nodes_from_documents(documents)
# This block of code is for educational purposes
# to showcase what a node looks like
i = 10
print(f"Text: \n{nodes[i].text}")
print("------------------")
print(f"Window: \n{nodes[i].metadata['window']}")
Text:
So this is not about whether it's ok to kill killers.
------------------
Window:
Defendants' lawyers are often incompetent. And prosecutors are often motivated more by publicity than justice.
In the real world, [about 4%](http://time.com/79572/more-innocent-people-on-death-row-than-estimated-study/) of people sentenced to death are innocent. So this is not about whether it's ok to kill killers. This is about whether it's ok to kill innocent people.
A child could answer that one for you.
This year, in California, you have a chance to end this, by voting yes on Proposition 62.
import weaviate
# Connect to your Weaviate instance
client = weaviate.Client(
    embedded_options=weaviate.embedded.EmbeddedOptions(),
)
print(f"Client is ready: {client.is_ready()}")
# Uncomment this line to get more information about the client
# client.get_meta()
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
index_name = "MyExternalContext"
# Construct vector store
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    index_name=index_name,
)
# Set up the storage for the embeddings
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# If an index with the same index name already exists within Weaviate, delete it
if client.schema.exists(index_name):
    client.schema.delete_class(index_name)
# Set up the index
# Build the VectorStoreIndex, which encodes the pre-chunked nodes
# into embeddings and stores them for future retrieval
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
)
{
"class": "MyExternalContext",
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"invertedIndexConfig": {
"bm25": {
"b": 0.75,
"k1": 1.2
},
"cleanupIntervalSeconds": 60,
"stopwords": {
"additions": null,
"preset": "en",
"removals": null
}
},
"multiTenancyConfig": {
"enabled": false
},
"properties": [
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "_node_content",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "file_path",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "file_type",
"tokenization": "word"
},
{
"dataType": [
"uuid"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": false,
"name": "doc_id"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "text",
"tokenization": "word"
},
{
"dataType": [
"number"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": false,
"name": "file_size"
},
{
"dataType": [
"uuid"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": false,
"name": "document_id"
},
{
"dataType": [
"uuid"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": false,
"name": "ref_doc_id"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "file_name",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "_node_type",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "last_accessed_date",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "creation_date",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:39 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "last_modified_date",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:48 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "original_text",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"description": "This property was generated by Weaviate's auto-schema feature on Wed Feb 14 13:46:48 2024",
"indexFilterable": true,
"indexSearchable": true,
"name": "window",
"tokenization": "word"
}
],
"replicationConfig": {
"factor": 1
},
"shardingConfig": {
"virtualPerPhysical": 128,
"desiredCount": 1,
"actualCount": 1,
"desiredVirtualCount": 128,
"actualVirtualCount": 128,
"key": "_id",
"strategy": "hash",
"function": "murmur3"
},
"vectorIndexConfig": {
"skip": false,
"cleanupIntervalSeconds": 300,
"maxConnections": 64,
"efConstruction": 128,
"ef": -1,
"dynamicEfMin": 100,
"dynamicEfMax": 500,
"dynamicEfFactor": 8,
"vectorCacheMaxObjects": 1000000000000,
"flatSearchCutoff": 40000,
"distance": "cosine",
"pq": {
"enabled": false,
"bitCompression": false,
"segments": 0,
"centroids": 256,
"trainingLimit": 100000,
"encoder": {
"type": "kmeans",
"distribution": "log-normal"
}
}
},
"vectorIndexType": "hnsw",
"vectorizer": "none"
}
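Note that the vectorizer is set to "none": LlamaIndex computes the embeddings client-side and only ships the vectors to Weaviate, while the window and original_text properties added by the SentenceWindowNodeParser are stored as plain text metadata.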
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
# The target key defaults to `window` to match the node_parser's default
postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)
# This block of code is for educational purposes
# to showcase how the MetadataReplacementPostProcessor works
# from llama_index.core.schema import NodeWithScore
# from copy import deepcopy
# scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
# nodes_old = [deepcopy(n) for n in nodes]
# replaced_nodes = postproc.postprocess_nodes(scored_nodes)
# print(f"Retrieved sentence: {nodes_old[i].text}")
# print("------------------")
# print(f"Replaced window: {replaced_nodes[i].text}")
from llama_index.core.postprocessor import SentenceTransformerRerank
# BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
rerank = SentenceTransformerRerank(
    top_n=2,
    model="BAAI/bge-reranker-base",
)
# The QueryEngine class is equipped with the generator
# and facilitates the retrieval and generation steps
query_engine = index.as_query_engine(
    similarity_top_k=6,
    # Combine dense vector search with BM25; alpha=0.5 weights both equally
    vector_store_query_mode="hybrid",
    alpha=0.5,
    node_postprocessors=[postproc, rerank],
)
# Run a query against the RAG pipeline
response = query_engine.query(
    "What happened at Interleaf?"
)
print(str(response))
Interleaf, inspired by Emacs, added a scripting language to its software and made that scripting language a dialect of Lisp.
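# The output below compares the full sentence window handed to the LLM with
# the single sentence that was originally embedded and retrieved. A minimal
# sketch, assuming the standard LlamaIndex Response object:
window = response.source_nodes[0].node.metadata["window"]
sentence = response.source_nodes[0].node.metadata["original_text"]
print(f"Window: {window}")
print("------------------")
print(f"Original sentence: {sentence}")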
Window: I didn't realize it at the time, but I was getting worn out from the effort and stress of running Viaweb. For a while after getting to California I tried to keep up my usual routine of programming until 3 in the morning, but fatigue, combined with Yahoo's aged culture and the dreary cube farm in Santa Clara, gradually wore me down. After a few months it felt disconcertingly like working at Interleaf.
Yahoo had given us a lot of options when they bought us. At the time I thought Yahoo was so overvalued the options would never be worth anything, but to my surprise the stock went up 5x in the next year.
------------------
Original sentence: After a few months it felt disconcertingly like working at Interleaf.