llama.cpp is a popular, simplified implementation of LLaMA models in pure C/C++. It already has ctypes-based Python bindings, as well as Go and Node.js bindings.

This is just a quick note that I’ve implemented a basic SWIG-based wrapper which, among other things, can build a Java/JVM binding (already implemented) as well as C#/CLR and many other language bindings. The binding is available at: https://github.com/bnikolic/llama.swig

A basic example demonstrating the wrapper:

/* This follows the example "simple.cpp" */

package examples;

import uk.co.bnikolic.LLamaWrap;
import uk.co.bnikolic.llama_context_params;
import uk.co.bnikolic.gpt_params;
import uk.co.bnikolic.vectorll;
import uk.co.bnikolic.llama_token_data;
import uk.co.bnikolic.llama_token_data_array;
import uk.co.bnikolic.floatArray;

public class Simple {
    public static void main(String[] args) throws Exception {

        System.out.println(LLamaWrap.llama_print_system_info());

        gpt_params params = new gpt_params();
        params.setPrompt("Hello my name is");
        params.setModel("models/ggml-vicuna-7b-1.1-q4_0.bin");

        LLamaWrap.llama_init_backend();

        var ctx = LLamaWrap.llama_init_from_gpt_params(params);

        // Tokenize the prompt (true = prepend the beginning-of-sequence token)
        var tokens_list = LLamaWrap.llama_tokenize(ctx, params.getPrompt(), true);

        int max_context_size     = LLamaWrap.llama_n_ctx(ctx);
        int max_tokens_list_size = max_context_size - 4;

        // Print the tokenized prompt
        for (var id : tokens_list) {
            System.out.println(LLamaWrap.llama_token_to_str(ctx, id));
        }

        // Main prediction loop: keep generating until the context is full
        while (LLamaWrap.llama_get_kv_cache_token_count(ctx) < max_context_size) {

            //---------------------------------
            // Evaluate the tokens :
            //---------------------------------

            if (LLamaWrap.llama_eval(ctx,
                                     tokens_list.data(),
                                     tokens_list.size(),
                                     LLamaWrap.llama_get_kv_cache_token_count(ctx),
                                     params.getN_threads()) != 0) {
                System.out.println("failed");
                break;
            }

            tokens_list.clear();

            //---------------------------------
            // Select the best prediction :
            //---------------------------------

            var logits  = floatArray.frompointer(LLamaWrap.llama_get_logits(ctx));
            var n_vocab = LLamaWrap.llama_n_vocab(ctx); // the size of the LLM vocabulary (in tokens)

            vectorll candidates = new vectorll();
            candidates.reserve(n_vocab);

            for (int token_id = 0; token_id < n_vocab; token_id++) {
                llama_token_data d = new llama_token_data();
                d.setId(token_id);
                d.setLogit(logits.getitem(token_id));
                d.setP(0.0f);
                candidates.emplace_back(d);
            }

            llama_token_data_array candidates_p = new llama_token_data_array();
            candidates_p.setData(candidates.data());
            candidates_p.setSize(candidates.size());
            candidates_p.setSorted(false);

            // Select it using the "Greedy sampling" method :
            int new_token_id = LLamaWrap.llama_sample_token_greedy(ctx, candidates_p);

            // Is it an end of stream?
            if (new_token_id == LLamaWrap.llama_token_eos()) {
                System.out.println("end");
                break;
            }

            // Print the new token :
            System.out.println(LLamaWrap.llama_token_to_str(ctx, new_token_id));

            // Push this new token for the next evaluation :
            tokens_list.push_back(new_token_id);

        } // end of main loop

    }

}
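For reference, the "greedy sampling" call in the listing simply picks the token with the largest logit. Written out by hand, using the logits and n_vocab variables from the example, it is roughly the following (a sketch that is equivalent in effect to LLamaWrap.llama_sample_token_greedy, not code from the wrapper):

// Greedy sampling done by hand: select the index of the largest logit.
int new_token_id = 0;
for (int token_id = 1; token_id < n_vocab; token_id++) {
    if (logits.getitem(token_id) > logits.getitem(new_token_id)) {
        new_token_id = token_id;
    }
}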


Running the example, with LLamaWrap.jar and the native shared library in the current directory, first prints the system information from llama_print_system_info():

LD_LIBRARY_PATH=. java -cp "LLamaWrap.jar:." examples.Simple
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
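The LD_LIBRARY_PATH=. is needed because the generated Java classes call into a compiled native shared library that is loaded at runtime. The usual SWIG arrangement is a static initialiser along the following lines (a sketch of the common pattern only; the library name "LLamaWrap" and the exact place where llama.swig performs the load are assumptions):

// Sketch of the usual SWIG native-library loading pattern (assumed, not
// copied from llama.swig). On Linux the name resolves to libLLamaWrap.so,
// which is why the example is run with LD_LIBRARY_PATH=. above.
static {
    System.loadLibrary("LLamaWrap");
}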