Last active 1686796559

Convert SillyTavern jsonl chats to TXT files using AI.

cyberes's Avatar cyberes revised this gist 1686796559. Go to revision

1 file changed, 169 insertions (file created)

@@ -0,0 +1,169 @@
1 + #!/usr/bin/env python3
2 + import argparse
3 + import re
4 + from pathlib import Path
5 + import sys
6 + import json
7 + import openai
8 + import tiktoken
9 + import threading
10 + import traceback
11 +
12 + """
13 + Convert SillyTavern jsonl chats to TXT files using AI.
14 +
15 + HOW TO USE:
16 + 1. `pip install tiktoken openai`
17 + 2. Find the chat file you want to convert. It's the `jsonl` file located in `SillyTavern/public/chats/<character name>/`
18 + 3. Run this script with `python3 <path to this script> <path to the jsonl file> <path to where you want to save the TXT file> --key <your OpenAI API key>`
19 +
20 + This uses a temperature of 0 so don't re-run this expecting something different.
21 +
22 + If your chat is larger than the context window it will be sent in batches. After each batch, the response is written to your output file.
23 + """
24 +
class TimerThread(threading.Thread):
    """Background thread that shows an elapsed-seconds counter on one console line.

    The counter is redrawn in place (using a carriage return) roughly once per
    second until `stop()` is called.

    Args:
        prompt: Text displayed in front of the elapsed-seconds counter.
    """

    def __init__(self, prompt: str = 'Waiting for response...'):
        # Daemon thread: if the main thread exits without calling stop(),
        # the counter must not keep the process alive.
        super().__init__(daemon=True)
        self._stop_event = threading.Event()
        self.prompt = prompt

    def run(self):
        """Redraw the counter once per second until the stop event is set."""
        seconds = 0
        while not self._stop_event.is_set():
            # flush=True: the line has no trailing newline, so a line-buffered
            # stdout would otherwise hold it back and show nothing.
            print(f"\r{self.prompt} {seconds}s", end="", flush=True)
            seconds += 1
            # wait() doubles as an interruptible sleep: stop() wakes it at once.
            self._stop_event.wait(1)

    def stop(self):
        """Stop the counter, wait for the thread to finish, then end the console line."""
        self._stop_event.set()
        # Join before printing the newline so the thread cannot redraw the
        # counter after it. Skip when not started or when called from the
        # thread itself, where join() would raise RuntimeError.
        if self.is_alive() and threading.current_thread() is not self:
            self.join()
        print('')
41 +
42 +
def count_tokens(string: str, encoding_name: str = 'cl100k_base', encoding_for_model: str = None) -> int:
    """Return the number of tiktoken tokens in *string*.

    When *encoding_for_model* is given, the encoding is looked up for that
    model name; otherwise the encoding called *encoding_name* is used.
    """
    encoder = (
        tiktoken.encoding_for_model(encoding_for_model)
        if encoding_for_model
        else tiktoken.get_encoding(encoding_name)
    )
    return len(encoder.encode(string))
50 +
def array_of_dicts_to_jsonl(array_of_dicts):
    """Serialize each item as one JSON document and join them with newlines (JSONL)."""
    serialized_rows = map(json.dumps, array_of_dicts)
    return "\n".join(serialized_rows)
54 +
def send_to_openai(msg, model):
    """Ask the OpenAI chat model to summarize a JSONL transcript chunk.

    A console timer is shown while the request is in flight.

    Args:
        msg: JSONL text of the chat messages to summarize.
        model: Name of the OpenAI chat model to use.

    Returns:
        The response object returned by `openai.ChatCompletion.create`.

    Exits the process with status 1 if the API call raises an exception.
    """
    user_msg = f"I have a jsonl transcript of an internet roleplay session. I need you to strip everything that isn't important to the story and write a summary of each message. For each message, use the format:\n\n```\n<character name>:\n<what the character says and important actions. don't use double linebreaks except for separating characters>```\n\nSome messages include things that other characters say so please organize it accordingly.\n\n{msg}"
    timer_thread = TimerThread(prompt=f'Sending {count_tokens(user_msg)} tokens to the AI...')
    timer_thread.start()
    try:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "user", "content": user_msg}
                ],
                temperature=0,
            )
        finally:
            # Always stop the timer, including on errors and Ctrl-C. Otherwise
            # the still-running thread keeps the process alive after
            # sys.exit() and the error text lands on the timer's line.
            timer_thread.stop()
    except Exception as e:
        print('Exception:', e)
        sys.exit(1)

    return response
73 +
def _max_tokens_for_model(model):
    """Return the per-request input token budget for *model*, or None if the model is unknown."""
    # Budgets are halved because the request and the reply share the context window.
    if model in ("gpt-3.5-turbo", "text-davinci-003"):
        return 3050 // 2
    if model == "gpt-4":
        return 8050 // 2
    return None


def _load_chat_messages(input_jsonl, max_tokens):
    """Read the chat file and return (trimmed messages, total token count).

    Exits with status 1 on invalid JSON or on a message exceeding *max_tokens*.
    """
    chatlines = []
    total_tokens = 0
    raw_lines = input_jsonl.read_text(encoding='utf-8').splitlines()
    for line_number, raw_line in enumerate(raw_lines, start=1):
        if not raw_line.strip():
            continue  # tolerate blank lines
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError as e:
            print(f'JSON decode error on line {line_number}: {e}')
            sys.exit(1)

        # We don't want metadata messages
        if not isinstance(record, dict) or 'mes' not in record:
            continue

        # Trim the message down to save tokens
        message = {'name': record['name'], 'mes': record['mes']}

        # We can't split messages so if one is larger than the context limit we have to quit
        token_count = count_tokens(json.dumps(message))
        total_tokens += token_count
        if token_count > max_tokens:
            print('Message on line', line_number, 'is too long at', token_count, 'tokens. Max tokens is', max_tokens, 'You need to decide out how to handle this.')
            sys.exit(1)
        chatlines.append(message)
    return chatlines, total_tokens


def _iter_batches(chatlines, max_tokens):
    """Yield consecutive JSONL chunks of *chatlines*, each within *max_tokens* tokens."""
    batch = []
    for message in chatlines:
        # Close the current batch BEFORE adding a message that would overflow
        # it, so no chunk exceeds the budget and no message is sent twice.
        # A batch always holds at least one message; every single message was
        # already checked against the budget while loading.
        if batch and count_tokens(array_of_dicts_to_jsonl(batch + [message])) > max_tokens:
            yield array_of_dicts_to_jsonl(batch)
            batch = []
        batch.append(message)
    if batch:
        yield array_of_dicts_to_jsonl(batch)


def main():
    """Command-line entry point: summarize a SillyTavern JSONL chat into a TXT file.

    The chat is sent to the model in batches that fit the model's token budget,
    and each batch's response is appended to the output file as soon as it
    arrives. Exits with status 1 on invalid input (missing file, missing output
    directory, unknown model, malformed JSON, or an over-long message).
    """
    parser = argparse.ArgumentParser(description='Convert SillyTavern jsonl files to TXT files using AI for importing into the infinite context server.')
    parser.add_argument('filepath', help='The path to the jsonl file to parse')
    parser.add_argument('output_txt', help='The output TXT file to create.')
    parser.add_argument('--key', required=True, help='Your OpenAI API key')
    parser.add_argument('--model', default='gpt-4', help='Name of the OpenAI model to use. GPT-4 seems to work the best for this. Default: gpt-4')
    args = parser.parse_args()

    openai.api_key = args.key

    input_jsonl = Path(args.filepath).expanduser().absolute().resolve()
    output_txt = Path(args.output_txt).expanduser().absolute().resolve()

    print('Converting chat:', input_jsonl)
    print('Using model:', args.model)

    # Validate everything before touching the output file so a bad invocation
    # never destroys an existing result.
    if not input_jsonl.exists():
        print('Input file does not exist:', input_jsonl)
        sys.exit(1)
    if not output_txt.parent.exists():
        print('Output parent directory does not exist:', output_txt.parent)
        sys.exit(1)

    max_tokens = _max_tokens_for_model(args.model)
    if max_tokens is None:
        print('Unknown model:', args.model)
        sys.exit(1)

    chatlines, total_tokens = _load_chat_messages(input_jsonl, max_tokens)
    num_chat_messages = len(chatlines)
    print('Total tokens:', total_tokens)

    # Empty the file since we append to it. Truncating (rather than unlinking)
    # also works when the file does not exist yet.
    output_txt.write_text('', encoding='utf-8')

    for ai_input in _iter_batches(chatlines, max_tokens):
        summary = send_to_openai(ai_input, args.model).choices[0].message.content.strip()
        # Append after every batch so finished work survives a later failure.
        with open(output_txt, 'a', encoding='utf-8') as f:
            f.write(f"{summary}\n\n\n")

    print(f'Converted {num_chat_messages} lines.')
    print('Saved to:', output_txt)
167 +
# Run the converter only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Newer Older