"adultery_state_laws":"This evaluation checks the model's ability to accurately answer true or false questions about adultery laws in various states.",
"afrikaans-lexicon":"Test the model's ability to distinguish between existing Afrikaans words.",
"aime_evaluation":"Test the model's ability to solve math problems from the AIME competition.",
"algebra-word-problems":"Test the model's ability to perform basic algebra word problems",
"ambiguous-sentences":"test pair of sentences that differ in only one or two words and that contain an ambiguity that is resolved in opposite ways in the two sentences and requires the use of world knowledge and reasoning for its resolution.",
"arithmetical_puzzles":"Test the model's ability to solve complex arithmetical puzzles stated in natural language.",
"asl-classifiers":"Test the model's ability to understand the usage of ASL classifiers.",
"belarusian-synonyms":"Test the model's ability to classify if the Belarusian words are synonyms or not.",
"bitwise":"Test the model's ability to simulate a simple bitwise operating machine",
"body-movement":"Test the model's ability to understand human body movement",
"born-first":"Test the model's ability to determine who was born first.",
"brazilian_laws":null,
"building_floorplan":null,
"bulgarian-lexicon":"Test the model's ability to distinguish between existing and hallucinated Bulgarian words.",
"canto_wu_pronunciation":"Test the model's knowledge of Cantonenese and Wu Chinese pronounciation in a zero-shot setting",
"chess":"Test the model's ability to play chess",
"chess-piece-count":"Test the model's ability to understand chess moves, rules and theory",
"chinese_tang_poetries":"Evaluate the mobel's ability of identifying the accurate author of Chinese Tang Poetries.",
"chinese_zodiac":null,
"color_theory_complementary":"Test the model's ability to accurately recognize complementary colors in the color theory.",
"compare-countries-area":"Test the model's ability to determine which country has the largest area.",
"comprehensive-graph-reasoning":"Test the model's ability to identify the number of rings and clusters, and the shortest path between two random nodes in undirected, weighted graphs.",
"count_intersections_polynomial":"Test the models ability to count the intersections between the x-axis and a polynomial of third degree, with simple inputs that humans would be able to do in their head.",
"count_token_freq_dna":"Test the model's ability to count the occurrence of a specific nucleotide (A, T, G, or C) within provided DNA sequences.",
"counterfactual-reasoning":"Example eval that uses fuzzy matching to score completions.",
"countries":null,
"crepe":null,
"cricket_situations":"Tests the models ability to apply rules of the sport cricket to different situations",
"crontab":null,
"cube-pack":null,
"date-booking":null,
"date-calculator":null,
"day-of-week-from-date":null,
"determinant":null,
"diagrammatic_logic":null,
"directions":"Eval that tests the models ability to keep state of direction after a series of turns",
"dna-melting-calculation":"Test the model's ability to solve DNA melting temperature problems.",
"emoji-riddle":"Test the model's ability to solve emoji riddles.",
"escher-sentences":null,
"european-date-format-challenge":"This performance evaluation examines the model's ability to reasonably assume that a date in a text follows the DD/MM/YYYY format when a subsequent date in the text is invalid for the MM/DD/YYYY format (e.g., 27/2/2024).",
"fcc_amateur_extra":"Multiple choice questions (with answers) about from the US FCC Amateur Radio License question pool.",
"finance":"Test the model's ability to understand financial concepts and do math.",
"financial-derivatives":"Testing the models ability to answer derivative questions correctly.",
"french-part-of-speech":"Test the model's knowledge what part of speech a given word can have in French, using data from fr.wiktionary.org (as of 2023-05-20)",
"geometry_puzzle":"Assesses the model's performance in solving spatial and geometrical puzzles that require imagination, logic, and pattern recognition.",
"german-part-of-speech":"Test the model's knowledge what part of speech a given word can have in German, using data from de.wiktionary.org (as of 2023-05-20)",
"gol":"Robust test. Evaluate model's ability to determine the next state in a simple game of life board",
"greek-vocabulary":null,
"guess-the-singer":"Test the model's ability to predict singer by the first 10 words of the song",
"heart-disease":"Test model's capability of predicting the presence of heart disease.",
"hebrew-rhyme":"Composite task that involves translation and rhyming.",
"hebrew-same-noun-gender":"Do these hebrew nouns have the same grammatical gender?",
"hindi_shuddha":null,
"hindi_words":null,
"historical-kana-orthography-reading":"Test the model's ability to reading historical kana orthography.",
"imperial_date_to_string":null,
"indonesian_numbers":null,
"infiniteloop-match":"Test the model's ability to recognized if a piece of code can get into a state where it would run forever.",
"internal_representations":null,
"invert_word_wise":"Logically, inverting strings twice just results in the original string again. The LLMs find it very difficult to deduce it, and somehow (at least up to GPT-3.5) mix things up.",
"invoice_due_date_leap_day_adjustment":null,
"irony":"Tests the ability to identify one of three types of irony, situational, verbal, or dramatic.",
"italian-new-words":"Test the model's ability to distinguish Italian words that have recently entered the language.",
"italian-rhyme":"Composite task that involves translation and rhyming.",
"japanese-itpassport-exam01":"source from IT\u30d1\u30b9\u30dd\u30fc\u30c8\u8a66\u9a13 \u4ee4\u548c5\u5e74\u5ea6\u5206(IT Passport Examination for FY2023) in https://www3.jitec.ipa.go.jp/JitesCbt/html/openinfo/questions.html",
"japanese-national-medical-exam01":null,
"japanese-national-medical-exam02":null,
"japanese_driving_license":"Test the model's ability to correctly answer Japanese Driving licence exam.",
"japanese_number_reading":"Test the model's ability to translate japanese written number into arabic numerals.",
"japanese_populer_video_game_title_and_the_publisher":"Test the model's ability to identify game publisher published popular japanese video games.",
"list_comparison_missing_name":"Test the model's ability to determine which name is present in list 1 but not in list 2. List 1 is formatted 'First Last' while list two is formatted 'Last First'. Lists are between 20-35 names long.",
"music-theory-chord-notes":"Test the model's ability to spell out the notes in a given chord name",
"music_theory_scale_modes":"Test the model's ability to identify which western music scale a series of 8 notes belongs to",
"nepali-song-singer":"Test the model's ability to understand English transliteration of Nepali phrase and provide us the singer of that particular title.",
"newsology":"Ask the model to pick a fruit, when telling the model that we have provided a list of vegetables. And then vice versa (pick vegetable, from basket of fruit).",
"shared-borders":"Test the model's ability to list the countries that share a land border with a given pair of countries. This tests the model's ability to intersect sets known within its weights.",
"shopping_discount_comparison":"Test the model's ability to compare discounts and select the best one",
"simple-knowledge-mongolian":"Test the model's ability to understand simple world knowledge in mongolian language cyrillic and latin variants",
"simple_physics_engine":"Test the model's ability to reason about and simulate a simplified physics model in a 2d environment.",
"solve-for-variable":"Multiple-choice questions about solving a mathematical equation for a variable.",
"sort-numeric":"Tests performance sorting different comma-separated values under different circumstances (integers/decimals, positives/negatives, as well as currency-formatted values).",
"south-african-bands":"Test the model's ability to understand that we are providing the name of a South African band, find the supplied band, and if the band has a lead vocalist provide the stage name or real name of the vocalist.",
"spanish_feminine_noun_masculine_article":"In Spanish there are are a number of nouns like \"agua\" which are feminine but use the masculine article, \"El agua\" is correct and \"La agua\" is incorrect",
"squares-gpt":"Test the model's ability to solve basic geometric reasoning questions.",
"stats-tests":null,
"svg_understanding":"Test visual understanding of SVG files.",
"swap-words":null,
"swedish-spelling":"Test the model's ability to identify misspelled Swedish words.",
"swedish_sat":"Test the model's ability to answer questions from the Swedish h\u00f6gskoleprovet, kind of like the SATs in the US. The 30 questions are from the spring test 2023 verbal part, test number 3.",
"syllables_long_words":null,
"syntax-check":"Test the model's ability to determine programming language from a snippet.",
"tempo_to_measure_count":"Test the model's ability to calculate the number of measures in a song, based on the tempo of each note and the corresponding time signature of the piece.",
"test-comp-sci":"Testing the models ability to answer multiple choice computer science questions correctly.",
"test_japanese_radical":"In Japan, the radical changes depending on the type of kanji. Test your reading of various radicals.",
"test_japanese_units":"In Japan, when counting things, the unit changes depending on the type. Test your use of complex units.",
"tetris":"Tests the models ability of spacial awareness by rotating tetris cubes. Tests all 7 classic tetris blocks and performs clockwise and counterclockwise rotations from different starting points.",
"vintage_phone_keyboard_decode":"An array of correspondence between letters and numbers on the mobile phone keyboard evals, examining the model the ability to distinguish and analyze the relationship within groups in multiple groups composed of English letters and numbers.",
"which-is-heavier":"Test the model's ability to determine which of two quantities is heavier when the heavier quantity is made up of lighter objects (and vice versa).",
"wkt_understanding":"Test understanding of Multipolygon WKT (Well-Known Text) representation of vector geometry objects (https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry).",
"word_vector_over_reliance":"Example eval that checks sampled text matches the expected output."