0%

向量資料庫

dbs物件

在akasha中,chromadb建立完之後,會被儲存成dbs物件,會儲存chromadb中的文件內容、metadata、向量資料、unique id,並被使用於後續的vector similarity search。

該物件可以添加多個chromadb資料,也可與其他dbs物件互相結合,也可根據filter抽取出需要的向量資料。

建立向量資料

process_db

process_db可對多個文件集(list of directory)建立chromadb,並回傳dbs物件與建立不成功的檔案list,若文件內容、使用嵌入模型、chunk size相等的chromadb已存在,則不會重新創建而直接讀取。

文件內容改變也會建立新的chromadb,設定參數ignore_check=True則不進行文件內容更改與否的確認,可更快的進行chromadb的讀取。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import akasha
import akasha.utils.db as adb

# Build (or load, if an identical chromadb already exists) vector databases
# for several data sources; process_db returns the dbs object and the list
# of files that could not be indexed.
data_source = ["docs/mic", "docs/1.pdf", "https://github.com/iii-org/akasha"]
emb_name = "openai:text-embedding-3-small"
db, ignore_files = adb.process_db(
    data_source=data_source,
    embeddings=emb_name,
    chunk_size=1000,  # fixed: missing comma here was a syntax error
    verbose=True,
)

### dbs object is a class that stores all information of the chromadb ###
db.get_docs()
db.get_embeds()
db.get_metadatas()  # fixed: call the method (was a bare attribute reference)
db.get_ids()


使用dbs物件

init

您可以直接宣告akasha.utils.db.dbs()建立空的dbs物件,也可以利用已建立的chromadb建立dbs物件。

dbs物件包含ids(每個文字段落的unique id), embeds(每個文字段落的向量), metadatas(每個文字段落的後設資料), docs(每個文字段落的內容) 。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

# A dbs object can be declared empty ...
db1 = adb.dbs()

### use chromadb to initialize dbs object ###
emb_obj = ah.handle_embeddings()
chroma_store = Chroma(
    persist_directory="chromadb/12345",
    embedding_function=emb_obj,
)
db2 = adb.dbs(chroma_store)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (db2.get_ids, db2.get_embeds, db2.get_metadatas, db2.get_docs):
    print(len(getter()))

merge

dbs物件之間可以使用.merge相互結合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()

# Load two persisted chromadb collections into separate dbs objects.
store_a = Chroma(persist_directory="chromadb/123", embedding_function=emb_obj)
store_b = Chroma(persist_directory="chromadb/456", embedding_function=emb_obj)
db1 = adb.dbs(store_a)
db2 = adb.dbs(store_b)

# merge pulls every record of db1 into db2 in place.
db2.merge(db1)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (db2.get_ids, db2.get_embeds, db2.get_metadatas, db2.get_docs):
    print(len(getter()))

add_chromadb

dbs物件可以添加新的chromadb資料

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()

# Open two persisted chromadb collections.
store_a = Chroma(persist_directory="chromadb/123", embedding_function=emb_obj)
store_b = Chroma(persist_directory="chromadb/456", embedding_function=emb_obj)

# Initialize a dbs object from the first, then append the second in place.
db = adb.dbs(store_a)
db.add_chromadb(store_b)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (db.get_ids, db.get_embeds, db.get_metadatas, db.get_docs):
    print(len(getter()))

get_Documents

使用get_Documents可以得到當前dbs物件中儲存的Documents list (包含page_content文件內容和metadata後設資料)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()

docsearch1 = Chroma(
    persist_directory="chromadb/123",
    embedding_function=emb_obj,
)

db = adb.dbs(docsearch1)

# get_Documents returns the stored chunks as langchain Document objects,
# each carrying .page_content (text) and .metadata (dict).
docs = db.get_Documents()

print([doc.page_content for doc in docs])  # list[str]; fixed: attribute is page_content, not page_contents
print([doc.metadata for doc in docs])  # list[dict]; fixed: use loop variable doc, not the list docs



載入chromadb

成功建立chromadb後,若想再載入chromadb,可以使用 get_storage_directory取得chromadb路徑並進行載入,取得dbs物件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import akasha.utils.db as adb

data_source = ["docs/mic"]
emb_name = "openai:text-embedding-3-small"
chunk_size = 1000

### create chromadb ###
db, ignore_files = adb.process_db(
    data_source=data_source,
    embeddings=emb_name,
    chunk_size=chunk_size,
    verbose=True,
)

### load chromadb ###
# get_storage_directory reconstructs the on-disk chromadb path from the
# source directory, chunk size, and embedding type/name.
embed_type, embed_name = emb_name.split(":")
chromadb_mic_dir = adb.get_storage_directory(
    "docs/mic", chunk_size, embed_type, embed_name
)

### after you created the chromadb, you can also load it by chroma_name ###
db, ignore_files = adb.load_db_by_chroma_name(chroma_name_list=[chromadb_mic_dir])

載入文件

若不想建立向量資料庫,只需要載入文件,可以使用 load_docs_from_info單純載入文件成list of Documents

1
2
3
4
5
6
import akasha.utils.db as adb

# load_docs_from_info only parses the sources into a list of Document
# objects; no vector database is created.
sources = ["docs/mic", "docs/1.pdf", "https://github.com/iii-org/akasha"]
docs = adb.load_docs_from_info(info=sources, verbose=True)

print(docs[0].page_content)



提取dbs物件

extract_db_by_file

extract_db_by_file可以將檔名符合file_name_list中的所有資料提取出來生成新的dbs物件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()
store = Chroma(persist_directory="chromadb/123", embedding_function=emb_obj)
db = adb.dbs(store)

# Build a new dbs object containing only the chunks whose source file
# matches one of these names.
target_files = ['f1.txt', 'f2.docx']
extracted_db = adb.extract_db_by_file(db=db, file_name_list=target_files)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (extracted_db.get_ids, extracted_db.get_embeds,
               extracted_db.get_metadatas, extracted_db.get_docs):
    print(len(getter()))

extract_db_by_keyword

extract_db_by_keyword可以將文字段落中存在任何keyword_list中keyword的所有資料提取出來生成新的dbs物件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()
store = Chroma(persist_directory="chromadb/123", embedding_function=emb_obj)
db = adb.dbs(store)

# Build a new dbs object containing only the chunks whose text contains
# at least one of these keywords.
keyword_list = ["資訊產業策進會", "AI人工智慧"]
extracted_db = adb.extract_db_by_keyword(db=db, keyword_list=keyword_list)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (extracted_db.get_ids, extracted_db.get_embeds,
               extracted_db.get_metadatas, extracted_db.get_docs):
    print(len(getter()))

extract_db_by_ids

extract_db_by_ids可以將存在id_list中的所有資料提取出來生成新的dbs物件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()
store = Chroma(persist_directory="chromadb/123", embedding_function=emb_obj)
db = adb.dbs(store)

# Build a new dbs object containing only the records with these unique ids.
id_list = ['2024-10-21-17_45_21_963065_0', '2024-10-21-17_45_23_601845_0']
extracted_db = adb.extract_db_by_ids(db=db, id_list=id_list)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (extracted_db.get_ids, extracted_db.get_embeds,
               extracted_db.get_metadatas, extracted_db.get_docs):
    print(len(getter()))

pop_db_by_ids

pop_db_by_ids會將所選id_list中的所有資料從dbs物件中移除

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

emb_obj = ah.handle_embeddings()
store = Chroma(persist_directory="chromadb/123", embedding_function=emb_obj)
db = adb.dbs(store)

# pop_db_by_ids removes the records with these ids from db in place
# (no new dbs object is returned).
id_list = ['2024-10-21-17_45_21_963065_0', '2024-10-21-17_45_23_601845_0']
adb.pop_db_by_ids(db=db, id_list=id_list)

# ids: list[str], embeds: list[list[float]],
# metadatas: list[dict], docs: list[dict]
for getter in (db.get_ids, db.get_embeds, db.get_metadatas, db.get_docs):
    print(len(getter()))



刪除chromadb內容

在akasha中,同個資料夾內的所有文件會儲存在同一個chromadb資料庫中,若你想刪除整個chromadb資料庫,可以使用 delete_documents_by_directory
若要刪除單一文件,可以使用 delete_documents_by_file

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import akasha
from langchain_chroma import Chroma
import akasha.utils.db as adb
import akasha.helper as ah

data_source = ["docs/mic", "docs/1.pdf", "https://github.com/iii-org/akasha"]
emb_name = "openai:text-embedding-3-small"
chunk_size = 1000

### create chromadb ###
db, ignore_files = adb.process_db(
    data_source=data_source,
    embeddings=emb_name,
    chunk_size=chunk_size,
    verbose=True,
)

# Delete a single file's chunks from its chromadb; returns the number of
# deleted records.
delete_num = adb.delete_documents_by_file(
    "docs/1.pdf", emb_name, chunk_size,
)

# Delete the whole chromadb that was built from a directory.
delete_num = adb.delete_documents_by_directory(
    "docs/mic", emb_name, chunk_size
)