sillytavern-chat-to-txt-gpt4.py (file created)

#!/usr/bin/env python3
import argparse
import json
import sys
import threading
import traceback
from pathlib import Path

import openai
import tiktoken

12 | + | """ | |
13 | + | Convert SillyTavern jsonl chats to TXT files using AI. | |
14 | + | ||
15 | + | HOW TO USE: | |
16 | + | 1. `pip install tiktoken openai` | |
17 | + | 2. Find the chat file you want to convert. It's the `jsonl` file located in `SillyTavern/public/chats/<character name>/` | |
18 | + | 3. Run this script with `python3 sillytavern-chat-to-txt.py <path to the jsonl file> <path to where you want to save the TXT file> --key <your OpenAI API key>` | |
19 | + | ||
20 | + | This uses a temperature of 0 so don't re-run this expecting something different. | |
21 | + | ||
22 | + | If your chat is larger than the context window it will be sent in batches. After each batch, the response is written to your output file. | |
23 | + | """ | |
24 | + | ||
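# An illustrative input line (assumed shape, not taken from a real chat); the
# script only keeps the 'name' and 'mes' fields and ignores everything else:
#   {"name": "Alice", "mes": "Hello there!", "is_user": false}


# Background thread that prints a running elapsed-seconds counter while we
# wait for the API to respond. Event.wait(1) doubles as an interruptible
# sleep, so stop() takes effect within a second.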
class TimerThread(threading.Thread):
    def __init__(self, prompt: str = 'Waiting for response...'):
        super().__init__()
        self._stop_event = threading.Event()
        self.prompt = prompt

    def run(self):
        seconds = 0
        while not self._stop_event.is_set():
            print(f"\r{self.prompt} {seconds}s", end="")
            seconds += 1
            self._stop_event.wait(1)

    def stop(self):
        self._stop_event.set()
        print('')


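# cl100k_base is the tokenizer used by the gpt-3.5-turbo/gpt-4 chat models, so
# the default gives accurate counts here; pass encoding_for_model to let
# tiktoken pick the encoding by model name instead.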
def count_tokens(string: str, encoding_name: str = 'cl100k_base', encoding_for_model: str = None) -> int:
    if encoding_for_model:
        enc = tiktoken.encoding_for_model(encoding_for_model)
    else:
        enc = tiktoken.get_encoding(encoding_name)
    num_tokens = len(enc.encode(string))
    return num_tokens

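# Re-serialize the trimmed message dicts as JSON Lines (one object per line)
# so the prompt keeps the same shape as the source chat file.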
def array_of_dicts_to_jsonl(array_of_dicts):
    jsonl_string = "\n".join(json.dumps(d) for d in array_of_dicts)
    return jsonl_string

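# Ask the model to compress the transcript. This uses the legacy pre-1.0
# `openai.ChatCompletion` interface, so it needs `openai<1.0` installed.
# temperature=0 keeps the output as deterministic as the API allows.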
def send_to_openai(msg, model):
    user_msg = f"I have a jsonl transcript of an internet roleplay session. I need you to strip everything that isn't important to the story and write a summary of each message. For each message, use the format:\n\n```\n<character name>:\n<what the character says and important actions. don't use double linebreaks except for separating characters>```\n\nSome messages include things that other characters say so please organize it accordingly.\n\n{msg}"
    timer_thread = TimerThread(prompt=f'Sending {count_tokens(user_msg)} tokens to the AI...')
    timer_thread.start()
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": user_msg}
            ],
            temperature=0,
        )
    except Exception:
        # Stop the timer thread first, otherwise the non-daemon thread keeps
        # the process alive (and keeps printing) after sys.exit()
        timer_thread.stop()
        traceback.print_exc()
        sys.exit(1)

    timer_thread.stop()
    return response

def main():
    parser = argparse.ArgumentParser(description='Convert SillyTavern jsonl files to TXT files using AI for importing into the infinite context server.')
    parser.add_argument('filepath', help='The path to the jsonl file to parse')
    parser.add_argument('output_txt', help='The output TXT file to create')
    parser.add_argument('--key', required=True, help='Your OpenAI API key')
    parser.add_argument('--model', default='gpt-4', help='Name of the OpenAI model to use. GPT-4 seems to work the best for this. Default: gpt-4')
    args = parser.parse_args()

    openai.api_key = args.key

    input_jsonl = Path(args.filepath).expanduser().resolve()
    output_txt = Path(args.output_txt).expanduser().resolve()

    # Validate both paths before touching the filesystem
    if not input_jsonl.exists():
        print('Input file does not exist:', input_jsonl)
        sys.exit(1)
    if not output_txt.parent.exists():
        print('Output parent directory does not exist:', output_txt.parent)
        sys.exit(1)

    # Empty the output file since we append to it later; missing_ok avoids a
    # FileNotFoundError on the first run
    output_txt.unlink(missing_ok=True)
    output_txt.touch()

    print('Converting chat:', input_jsonl)
    print('Using model:', args.model)

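    # Rough input-token budgets. The context window has to hold both the
    # prompt and the model's reply, so only about half is spent on input:
    # e.g. gpt-4's ~8k window -> 8050 // 2 = 4025 tokens per batch.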
    if args.model in ("gpt-3.5-turbo", "text-davinci-003"):
        max_tokens = 3050 // 2  # halved since the budget covers input + output tokens
    elif args.model == "gpt-4":
        max_tokens = 8050 // 2
    else:
        print('Unknown model:', args.model)
        sys.exit(1)

    chatlines = []
    total_tokens = 0
    raw = input_jsonl.read_text().splitlines()
    for i, line in enumerate(raw):
        try:
            tmp = json.loads(line)

            # We don't want metadata messages
            if 'mes' not in tmp.keys():
                continue

            # Trim the message down to save tokens
            msg = json.dumps({'name': tmp['name'], 'mes': tmp['mes']})

            # We can't split messages, so if a single one is larger than the context limit we have to quit
            token_count = count_tokens(msg)
            total_tokens += token_count
            if token_count > max_tokens:
                print('Message on line', i + 1, 'is too long at', token_count, 'tokens. Max tokens is', max_tokens, 'so you need to figure out how to handle this.')
                sys.exit(1)
            chatlines.append(json.loads(msg))
        except json.decoder.JSONDecodeError as e:
            print(f'JSON decode error on line {i + 1}: {e}')
            sys.exit(1)

    num_chat_messages = len(chatlines)
    print('Total tokens:', total_tokens)

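    # Greedy batching: keep pulling messages from the front of the queue until
    # adding one more would exceed the token budget, summarize the batch, then
    # start the next batch with the message that didn't fit.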
    while len(chatlines):
        ai_input_data = []
        output_data = []
        while True:
            # Check that the batch still fits in the max token budget
            ai_input_data.append(chatlines[0])
            ai_input = array_of_dicts_to_jsonl(ai_input_data)
            token_count = count_tokens(ai_input)

            if token_count <= max_tokens:
                # Only remove a message from the queue once it fits in the context
                del chatlines[0]
            else:
                # Over the budget: drop the message we just added (it will be
                # processed in the next batch), rebuild the prompt without it,
                # and send what we have
                ai_input_data.pop()
                ai_input = array_of_dicts_to_jsonl(ai_input_data)
                output_data.append(send_to_openai(ai_input, args.model).choices[0].message.content.strip())
                break

            # If there aren't any more messages to process, they all fit in the context
            if len(chatlines) == 0:
                output_data.append(send_to_openai(ai_input, args.model).choices[0].message.content.strip())
                break

        if len(output_data):
            with open(output_txt, 'a') as f:
                for msg in output_data:
                    f.write(f"{msg}\n\n\n")

    print(f'Converted {num_chat_messages} lines.')
    print('Saved to:', output_txt)


if __name__ == "__main__":
    main()