{
  "@context": {
    "@vocab": "http://schema.org/",
    "wikidata": "http://www.wikidata.org/entity/",
    "wdt": "http://www.wikidata.org/prop/direct/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "schema": "http://schema.org/",
    "wikibase": "http://wikiba.se/ontology#",
    "skos": "http://www.w3.org/2004/02/skos/core#"
  },
  "@type": "ScholarlyArticle",
  "headline": "Using Knowledge Graphs to harvest datasets for efficient CLIP model training",
  "author": [
    {
      "@type": "Person",
      "name": "Simon Ging",
      "email": "gings@cs.uni-freiburg.de",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Freiburg, Germany"
      }
    },
    {
      "@type": "Person",
      "name": "Sebastian Walter",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Freiburg, Germany"
      }
    },
    {
      "@type": "Person",
      "name": "Jelena Bratulić",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Freiburg, Germany"
      }
    },
    {
      "@type": "Person",
      "name": "Johannes Dienert",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Freiburg, Germany"
      }
    },
    {
      "@type": "Person",
      "name": "Hannah Bast",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Freiburg, Germany"
      }
    },
    {
      "@type": "Person",
      "name": "Thomas Brox",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Freiburg, Germany"
      }
    }
  ],
  "datePublished": "2025-09-30",
  "url": "https://entity-net.github.io",
  "abstract": "Training high-quality CLIP models typically requires enormous datasets, limiting domain-specific model development and increasing costs. Using knowledge graphs and smart web search, a robust CLIP model can be trained from scratch with significantly less data. The introduced EntityNet dataset contains 33M images with 46M text descriptions, enabling efficient training of generic and expert domain CLIP models.",
  "articleBody": "This work presents a method to harvest datasets for CLIP training using knowledge graphs and targeted web image search, enabling efficient training of expert and generic domain CLIP models with significantly less data and compute.",
  "hasPart": [
    {
      "@type": "DefinedTermSet",
      "name": "Defined Terms for CLIP Dataset Creation",
      "description": "Key terms and concepts used in the dataset creation process.",
      "hasDefinedTerm": [
        {
          "@type": "DefinedTerm",
          "name": "Entity",
          "description": "A visual concept or object extracted from knowledge graphs such as Wikidata or WordNet, e.g., 'eagle'."
        },
        {
          "@type": "DefinedTerm",
          "name": "Attribute",
          "description": "Visual properties or contexts generated for entities, e.g., 'small', 'running', used to create search queries."
        },
        {
          "@type": "DefinedTerm",
          "name": "Natural Type",
          "description": "The super-entity or category most associated with an entity, used to disambiguate search queries."
        },
        {
          "@type": "DefinedTerm",
          "name": "Alt Text",
          "description": "Text descriptions extracted from HTML image tags associated with images found via web search."
        },
        {
          "@type": "DefinedTerm",
          "name": "Knowledge Graph",
          "description": "Structured data sources like Wikidata and WordNet used to extract entities and attributes."
        },
        {
          "@type": "DefinedTerm",
          "name": "EntityNet",
          "description": "The dataset created using the proposed method, containing 33M images paired with 45M alt texts and knowledge graph labels."
        },
        {
          "@type": "DefinedTerm",
          "name": "CLIP Model",
          "description": "Contrastive Language-Image Pretraining model linking images and text for vision-language tasks."
        },
        {
          "@type": "DefinedTerm",
          "name": "Expert Domain",
          "description": "A specialized area such as living organisms where domain-specific CLIP models are trained."
        },
        {
          "@type": "DefinedTerm",
          "name": "Generic Domain",
          "description": "Broad visual world domain covering diverse categories like tools, materials, and buildings."
        },
        {
          "@type": "DefinedTerm",
          "name": "Training MACs",
          "description": "Multiply–accumulate operations used as a measure of computational training cost."
        }
      ]
    },
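    {
      "@type": "SoftwareSourceCode",
      "name": "Illustrative Training-MACs Estimate",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of how training MACs can be estimated from per-sample forward-pass MACs. The 3x forward/backward factor and the example numbers are rough rules of thumb and assumptions, not figures from the paper.",
      "text": "def training_macs(forward_macs_per_sample: int, num_samples: int, num_epochs: int) -> int:\n    # Rule of thumb (assumption): the backward pass costs roughly twice the\n    # forward pass, so one optimization step is ~3x the forward-pass MACs.\n    return 3 * forward_macs_per_sample * num_samples * num_epochs\n\n# Hypothetical example: 2e9 forward MACs per image, 33M images, 18 epochs.\nprint(training_macs(2_000_000_000, 33_000_000, 18))"
    },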
    {
      "@type": "Question",
      "name": "What is the main challenge addressed by the paper?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The challenge of training high-quality CLIP models efficiently with less data, especially for expert domains where large datasets are scarce."
      }
    },
    {
      "@type": "Question",
      "name": "How does the proposed method create datasets for CLIP training?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "By extracting entities and attributes from knowledge graphs, generating search queries, using web image search to collect images and alt texts, and linking images back to knowledge graph information."
      }
    },
    {
      "@type": "Question",
      "name": "What knowledge graphs are used in the dataset creation?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Wikidata and WordNet."
      }
    },
    {
      "@type": "Question",
      "name": "What is EntityNet?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "A dataset comprising 33 million images paired with 45 million alt texts and 613k text labels from knowledge graphs, partitioned into expert and generic domain subsets."
      }
    },
    {
      "@type": "Question",
      "name": "How does the model trained on EntityNet perform compared to other CLIP models?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "It achieves higher performance in expert domains like living organisms and comparable performance in generic domains with significantly less training compute."
      }
    },
    {
      "@type": "Question",
      "name": "What are the main steps in the dataset creation process?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Entity extraction, attribute generation, query building, and image search with filtering."
      }
    },
    {
      "@type": "Question",
      "name": "How are attributes generated for entities?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Using large language models (LLMs) prompted with entity and attribute information extracted from Wikidata to generate visual attributes categorized into six groups."
      }
    },
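    {
      "@type": "SoftwareSourceCode",
      "name": "Attribute Prompt Construction (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of building an LLM prompt that asks for visual attributes in the six groups named in the paper. The prompt wording and the build_attribute_prompt helper are hypothetical; the paper's exact prompts may differ.",
      "text": "ATTRIBUTE_GROUPS = ['color', 'pattern', 'parts', 'shape', 'environment', 'other']\n\ndef build_attribute_prompt(entity: str, description: str) -> str:\n    # Hypothetical prompt template (assumption, not the paper's exact wording).\n    groups = ', '.join(ATTRIBUTE_GROUPS)\n    return (f'List short visual attributes of the entity {entity!r} '\n            f'({description}), grouped into: {groups}.')\n\nprint(build_attribute_prompt('eagle', 'large bird of prey'))"
    },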
    {
      "@type": "Question",
      "name": "What image search engines are used?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Bing Image Search API is primarily used due to higher quality and lower cost; Google Image Search API was also tested."
      }
    },
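    {
      "@type": "SoftwareSourceCode",
      "name": "Bing Image Search Call (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of querying the Bing Image Search v7 REST API for one query and collecting image URLs with their names, which often serve as alt-text-like descriptions. The endpoint and fields follow the public API documentation; the paper's actual client and parameters are not published, so treat the details as assumptions.",
      "text": "import requests\n\nENDPOINT = 'https://api.bing.microsoft.com/v7.0/images/search'\n\ndef search_images(query: str, api_key: str, count: int = 50) -> list[tuple[str, str]]:\n    # Standard Bing Image Search v7 request; api_key is an Azure key.\n    resp = requests.get(\n        ENDPOINT,\n        headers={'Ocp-Apim-Subscription-Key': api_key},\n        params={'q': query, 'count': count},\n        timeout=30,\n    )\n    resp.raise_for_status()\n    # Each result carries the image URL and a short name/description.\n    return [(item['contentUrl'], item.get('name', ''))\n            for item in resp.json().get('value', [])]"
    },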
    {
      "@type": "Question",
      "name": "What filtering and deduplication methods are applied to the dataset?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Filtering out JSON-like or too long alt texts, images with extreme aspect ratios or low resolution, and deduplication using the Self-Supervised Descriptor for Image Copy Detection (SSCD)."
      }
    },
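    {
      "@type": "SoftwareSourceCode",
      "name": "Alt-Text and Image Filters (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of the rule-based filters described above: dropping JSON-like or overly long alt texts and images with low resolution or extreme aspect ratios. The thresholds are placeholder assumptions, not the paper's published values; SSCD deduplication is a separate neural step not shown here.",
      "text": "import json\n\nMAX_TEXT_LEN = 300   # assumption, not the paper's threshold\nMIN_SIDE = 64        # assumption\nMAX_ASPECT = 4.0     # assumption\n\ndef keep_alt_text(text: str) -> bool:\n    if not text or len(text) > MAX_TEXT_LEN:\n        return False\n    try:\n        # Drop alt texts that parse as JSON objects or arrays.\n        return not isinstance(json.loads(text), (dict, list))\n    except ValueError:\n        return True\n\ndef keep_image(width: int, height: int) -> bool:\n    if min(width, height) < MIN_SIDE:\n        return False\n    return max(width, height) / min(width, height) <= MAX_ASPECT"
    },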
    {
      "@type": "Question",
      "name": "What are the limitations of the proposed approach?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Requires existence of a knowledge graph and searchable image-text data; some drop in generic domain retrieval performance; focus on photos limits performance on paintings and sketches."
      }
    },
    {
      "@type": "HowTo",
      "name": "How to create a vision-language dataset using knowledge graphs",
      "description": "Steps to build a dataset for CLIP training using knowledge graphs and web image search.",
      "hasPart": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Extract entities",
          "text": "Extract entities from knowledge graphs like Wikidata using subclass and parent taxon relations, filtering by sitelink count and visual relevance."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Generate attributes",
          "text": "Use LLMs to generate visual attributes for entities, categorized into color, pattern, parts, shape, environment, and others."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Build search queries",
          "text": "Combine entities, attributes, and natural types to form search queries for image search engines."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Search and collect images",
          "text": "Use Bing and Google image search APIs to collect images and alt texts for the queries."
        },
        {
          "@type": "HowToStep",
          "position": 5,
          "name": "Filter and deduplicate",
          "text": "Filter out low-quality images and texts, remove duplicates using SSCD, and exclude images in evaluation datasets."
        }
      ]
    },
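    {
      "@type": "SoftwareSourceCode",
      "name": "Search Query Construction (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of step 3 of the preceding HowTo: combining an entity, its attributes, and its natural type into search queries. The exact query format is an assumption; the paper may combine the terms differently.",
      "text": "def build_queries(entity: str, natural_type: str, attributes: list[str]) -> list[str]:\n    # Appending the natural type disambiguates the entity, e.g. 'eagle bird'.\n    # One plain query plus one query per attribute (format is an assumption).\n    queries = [f'{entity} {natural_type}']\n    queries += [f'{attr} {entity} {natural_type}' for attr in attributes]\n    return queries\n\nprint(build_queries('eagle', 'bird', ['small', 'running']))\n# ['eagle bird', 'small eagle bird', 'running eagle bird']"
    },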
    {
      "@type": "HowTo",
      "name": "How to train a CLIP model on EntityNet",
      "description": "Training procedure for CLIP models using the EntityNet dataset.",
      "hasPart": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Prepare data",
          "text": "Sample text labels from alt texts and knowledge graph labels with a 50-50 probability."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Set training parameters",
          "text": "Use batch size 8192, train for 18 epochs with random resized crop augmentation."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Train model",
          "text": "Train ViT-B/32 or ViT-B/16 CLIP models from scratch on 8 GPUs for approximately 55 hours."
        }
      ]
    },
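    {
      "@type": "SoftwareSourceCode",
      "name": "50-50 Text Sampling (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of step 1 of the preceding HowTo: for each image, drawing the training text from its alt texts or its knowledge graph labels with equal probability. The fallback when one source is empty is an assumption.",
      "text": "import random\n\ndef sample_caption(alt_texts: list[str], kg_labels: list[str]) -> str:\n    # 50-50 choice between the two text sources (per the training recipe);\n    # fall back to the non-empty source if the chosen one is empty (assumption).\n    use_alt = random.random() < 0.5\n    pool = alt_texts if (use_alt and alt_texts) else (kg_labels or alt_texts)\n    return random.choice(pool)"
    },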
    {
      "@type": "HowTo",
      "name": "How to evaluate CLIP models",
      "description": "Evaluation protocols for zero-shot object classification and image-text retrieval.",
      "hasPart": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Zero-shot classification",
          "text": "Encode class names and optionally use context prompts; predict class with highest cosine similarity to image embedding."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Image-text retrieval",
          "text": "Evaluate recall@1 for image-to-text and text-to-image retrieval on datasets like COCO, Flickr30k, and XM3600."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Domain robustness",
          "text": "Test model robustness on distribution shift datasets such as ImageNet-A, ImageNet-R, ImageNet-Sketch, ImageNet-V2, and ObjectNet."
        }
      ]
    },
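    {
      "@type": "SoftwareSourceCode",
      "name": "Zero-Shot Classification Scoring (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch of the zero-shot protocol in the preceding HowTo: L2-normalize the image and class-name embeddings so dot products equal cosine similarities, then predict the argmax class. Computing the embeddings with the CLIP encoders is assumed to happen elsewhere.",
      "text": "import numpy as np\n\ndef zero_shot_predict(image_emb: np.ndarray, class_embs: np.ndarray) -> int:\n    # image_emb: (d,); class_embs: (num_classes, d).\n    image_emb = image_emb / np.linalg.norm(image_emb)\n    class_embs = class_embs / np.linalg.norm(class_embs, axis=1, keepdims=True)\n    # Dot product of unit vectors == cosine similarity; take the best class.\n    return int(np.argmax(class_embs @ image_emb))"
    },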
    {
      "@type": "Dataset",
      "name": "EntityNet",
      "description": "A large-scale vision-language dataset created using knowledge graphs and web image search, containing 33 million images paired with 45 million alt texts and 613k knowledge graph labels.",
      "url": "https://entity-net.github.io",
      "distribution": {
        "@type": "DataDownload",
        "contentUrl": "https://entity-net.github.io"
      },
      "variableMeasured": [
        {
          "@type": "PropertyValue",
          "name": "Images",
          "value": "33 million"
        },
        {
          "@type": "PropertyValue",
          "name": "Alt texts",
          "value": "45 million"
        },
        {
          "@type": "PropertyValue",
          "name": "Knowledge graph labels",
          "value": "613k"
        }
      ],
      "hasPart": [
        {
          "@type": "Dataset",
          "name": "Living entity subset",
          "description": "Subset of 10 million images focused on living organisms including animals, plants, and fungi."
        },
        {
          "@type": "Dataset",
          "name": "World entity subset",
          "description": "Subset of 23 million images covering a wide range of categories such as tools, geographical features, materials, and buildings."
        }
      ]
    },
    {
      "@type": "SoftwareSourceCode",
      "name": "EntityNet CLIP Training Code",
      "url": "https://entity-net.github.io",
      "programmingLanguage": "Python",
      "description": "Code for dataset creation and training CLIP models on EntityNet."
    },
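    {
      "@type": "SoftwareSourceCode",
      "name": "Running the Entity-Extraction SPARQL Query (Illustrative Sketch)",
      "programmingLanguage": "Python",
      "description": "A minimal Python sketch for executing the SPARQL query that follows against the public Wikidata Query Service. The endpoint and User-Agent are assumptions; the authors may run queries against their own infrastructure instead.",
      "text": "import requests\n\nWDQS = 'https://query.wikidata.org/sparql'\n\ndef run_sparql(query: str) -> list[dict]:\n    # The public endpoint returns SPARQL JSON results when format=json is set.\n    resp = requests.get(\n        WDQS,\n        params={'query': query, 'format': 'json'},\n        headers={'User-Agent': 'entitynet-jsonld-example/0.1'},\n        timeout=60,\n    )\n    resp.raise_for_status()\n    return resp.json()['results']['bindings']"
    },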
    {
      "@type": "CreativeWork",
      "name": "SPARQL Query for Entity Extraction",
      "description": "SPARQL query used to extract entities from Wikidata related to a given super-entity, filtering by sitelinks and language.",
      "text": "PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX wd: <http://www.wikidata.org/entity/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX schema: <http://schema.org/> PREFIX wikibase: <http://wikiba.se/ontology#> PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT DISTINCT ?ent ?label ?desc ?links (GROUP_CONCAT(DISTINCT ?alias; SEPARATOR=';;;') AS ?aliases) WHERE { VALUES ?typ { wd:Q42889 } ?ent wdt:P279* ?typ . ?ent rdfs:label ?label . FILTER(LANG(?label) = 'en') ?ent ^schema:about/wikibase:sitelinks ?links . FILTER(?links >= 5) OPTIONAL { ?ent schema:description ?desc . FILTER(LANG(?desc) = 'en') } OPTIONAL { ?ent skos:altLabel ?alias . FILTER(LANG(?alias) = 'en') } } GROUP BY ?ent ?label ?desc ?links ORDER BY DESC(?links)"
    }
  ],
  "publisher": {
    "@type": "Organization",
    "name": "arXiv",
    "url": "https://arxiv.org/abs/2505.02746v3"
  },
  "mainEntity": {
    "@type": "CreativeWork",
    "name": "Using Knowledge Graphs to harvest datasets for efficient CLIP model training",
    "about": [
      "Vision-Language Models",
      "CLIP training",
      "Knowledge Graphs",
      "Dataset creation",
      "EntityNet dataset",
      "Expert domain modeling",
      "Image search",
      "Large Language Models",
      "Zero-shot classification",
      "Image-text retrieval"
    ]
  }
}