Include token count in api.Chunk

Also calculate the number of tokens per chunk for Gemini responses, which fixes the tok/s meter for Gemini models. In addition, only the first candidate of each streamed Gemini response is now considered.
2024-06-09 20:45:18 +00:00
parent 42c3297e54
commit dfe43179c0
6 changed files with 26 additions and 16 deletions
--- a/pkg/api/provider/google/google.go
+++ b/pkg/api/provider/google/google.go
@@ -366,6 +366,8 @@ func (c *Client) CreateChatCompletionStream(
 	var toolCalls []FunctionCall

 	reader := bufio.NewReader(resp.Body)
+
+	lastTokenCount := 0
 	for {
 		line, err := reader.ReadBytes('\n')
 		if err != nil {
@@ -382,22 +384,25 @@ func (c *Client) CreateChatCompletionStream(

 		line = bytes.TrimPrefix(line, []byte("data: "))

-		var streamResp GenerateContentResponse
-		err = json.Unmarshal(line, &streamResp)
+		var resp GenerateContentResponse
+		err = json.Unmarshal(line, &resp)
 		if err != nil {
 			return "", err
 		}

-		for _, candidate := range streamResp.Candidates {
-			for _, part := range candidate.Content.Parts {
-				if part.FunctionCall != nil {
-					toolCalls = append(toolCalls, *part.FunctionCall)
-				} else if part.Text != "" {
-					output <- api.Chunk {
-						Content: part.Text,
-					}
-					content.WriteString(part.Text)
+		tokens := resp.UsageMetadata.CandidatesTokenCount - lastTokenCount
+		lastTokenCount += tokens
+
+		choice := resp.Candidates[0]
+		for _, part := range choice.Content.Parts {
+			if part.FunctionCall != nil {
+				toolCalls = append(toolCalls, *part.FunctionCall)
+			} else if part.Text != "" {
+				output <- api.Chunk{
+					Content:    part.Text,
+					TokenCount: uint(tokens),
 				}
+				content.WriteString(part.Text)
 			}
 		}
 	}