# Base IRI: relative references such as <#article> resolve against the article URL.
@base <https://openai.com/index/introducing-openai-privacy-filter/> .
# schema.org vocabulary namespace.
@prefix schema: <https://schema.org/> .
# OWL namespace. The W3C-defined namespace IRI uses the http scheme; the https
# spelling identifies a different (undefined) vocabulary, so owl: terms would not resolve.
@prefix owl: <http://www.w3.org/2002/07/owl#> .

# Root node describing the blog post. Topics are linked with schema:about,
# section summaries with schema:hasPart, and supporting nodes (glossary,
# argument outline, FAQ entries, organizations) with schema:mentions.
<#article> a schema:Article ;
  schema:headline "Introducing OpenAI Privacy Filter"@en ;
  schema:name "Introducing OpenAI Privacy Filter"@en ;
  # Typed as schema:Date (the expected range of schema:datePublished) so that
  # consumers read it as a calendar date rather than as free text.
  schema:datePublished "2026-04-22"^^schema:Date ;
  schema:inLanguage "en" ;
  schema:url <https://openai.com/index/introducing-openai-privacy-filter/> ;
  schema:publisher <#openai> ;
  schema:author <#openai> ;
  # Topics covered by the article: the product node plus the glossary terms.
  schema:about
    <#openai-privacy-filter>,
    <#pii-detection>,
    <#local-redaction>,
    <#token-classification-architecture>,
    <#span-decoding>,
    <#privacy-taxonomy>,
    <#context-aware-redaction>,
    <#pii-masking-300k>,
    <#annotation-correction>,
    <#account-number-label>,
    <#secret-label>,
    <#apache-2-license>,
    <#privacy-by-design>,
    <#domain-adaptation>,
    <#benchmark-f1> ;
  # Section headings of the post.
  # NOTE(review): seven headings are listed here but schema:hasPart below has
  # only six nodes; "Looking ahead" has no corresponding <#part-...> node.
  schema:articleSection
    "A small model with frontier personal data detection capability"@en,
    "Model overview"@en,
    "How we built it"@en,
    "How Privacy Filter performs"@en,
    "Limitations"@en,
    "Availability"@en,
    "Looking ahead"@en ;
  schema:abstract """The post introduces OpenAI Privacy Filter, an open-weight model for context-aware detection and redaction of personally identifiable information in text, designed for high-throughput local privacy workflows."""@en ;
  schema:articleBody """OpenAI introduces Privacy Filter as a small, open-weight model for detecting and redacting personally identifiable information in text. The post positions it as privacy-preserving infrastructure that can run locally, process long documents efficiently in one pass, and outperform traditional rule-based PII systems on context-sensitive cases. The released model is described as a bidirectional token classifier with span decoding, built from an autoregressive pretrained checkpoint and trained on a mix of public and synthetic privacy-labeled data. OpenAI reports 96% F1 on PII-Masking-300k and 97.43% F1 on a corrected version of that benchmark, while also emphasizing that the model is not anonymization, legal compliance, or policy review by itself. The article frames Privacy Filter as a practical building block for safer AI pipelines such as training, indexing, logging, and review."""@en ;
  # Per-section summary nodes (schema:WebPageElement).
  schema:hasPart
    <#part-capability>,
    <#part-architecture>,
    <#part-build>,
    <#part-performance>,
    <#part-limitations>,
    <#part-availability> ;
  # Supporting nodes: glossary set, argument outline, FAQ entries, organizations.
  schema:mentions
    <#defined-terms>,
    <#argument-howto>,
    <#faq-1>, <#faq-2>, <#faq-3>, <#faq-4>, <#faq-5>,
    <#faq-6>, <#faq-7>, <#faq-8>, <#faq-9>, <#faq-10>,
    <#openai>,
    <#hugging-face>,
    <#github> .

# Organizations referenced from the article node.

# Publisher and author of the post; also the brand of the product node.
<#openai>
  a schema:Organization ;
  schema:url <https://openai.com/> ;
  schema:name "OpenAI"@en .

# Distribution venue named in the availability statements.
<#hugging-face>
  a schema:Organization ;
  schema:url <https://huggingface.co/> ;
  schema:name "Hugging Face"@en .

# Distribution venue named in the availability statements.
<#github>
  a schema:Organization ;
  schema:url <https://github.com/> ;
  schema:name "GitHub"@en .

# The released model, typed both as software and as a product branded by <#openai>.
<#openai-privacy-filter>
  a schema:SoftwareApplication ,
    schema:Product ;
  schema:brand <#openai> ;
  schema:name "OpenAI Privacy Filter"@en ;
  schema:description "The post describes Privacy Filter as an open-weight model for context-aware PII detection and masking in unstructured text that can run locally."@en ;
  schema:applicationCategory "Privacy filtering model"@en .

# Glossary terms for the article. Each term points back to its containing set
# with schema:inDefinedTermSet so membership can be read from the term itself
# as well as from <#defined-terms>.

<#pii-detection> a schema:DefinedTerm ;
  schema:name "Personal data detection capability"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The article’s framing for Privacy Filter as a small model with frontier capability for detecting and redacting personally identifiable information in text."""@en .

<#local-redaction> a schema:DefinedTerm ;
  schema:name "Local redaction"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The ability to run Privacy Filter on device so raw text can be masked or redacted without leaving the local machine."""@en .

<#token-classification-architecture> a schema:DefinedTerm ;
  schema:name "Bidirectional token-classification architecture"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The model architecture described in the post: an autoregressive checkpoint adapted into a bidirectional token classifier over a privacy label taxonomy."""@en .

<#span-decoding> a schema:DefinedTerm ;
  schema:name "Constrained span decoding"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The Viterbi-based decoding procedure used to convert token-level predictions into coherent spans with BIOES boundary constraints."""@en .

<#privacy-taxonomy> a schema:DefinedTerm ;
  schema:name "Privacy taxonomy"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The label system used to define which kinds of personal identifiers, account numbers, and secrets the model should detect."""@en .

<#context-aware-redaction> a schema:DefinedTerm ;
  schema:name "Context-aware redaction"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The model’s ability to detect PII based on surrounding context rather than relying only on deterministic patterns like regexes."""@en .

<#pii-masking-300k> a schema:DefinedTerm ;
  schema:name "PII-Masking-300k benchmark"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The public benchmark used in the article to report Privacy Filter’s headline F1, precision, and recall results."""@en .

<#annotation-correction> a schema:DefinedTerm ;
  schema:name "Annotation correction"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The evaluation adjustment OpenAI applied after identifying label issues in the benchmark, resulting in a higher corrected F1 score."""@en .

<#account-number-label> a schema:DefinedTerm ;
  schema:name "account_number label"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The privacy label used to capture banking and financial account identifiers such as credit card and bank account numbers."""@en .

<#secret-label> a schema:DefinedTerm ;
  schema:name "secret label"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The privacy label used to capture secrets such as passwords and API keys in text and code-like content."""@en .

<#apache-2-license> a schema:DefinedTerm ;
  schema:name "Apache 2.0 availability"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The release condition highlighted in the article: the model is available as open weights under the Apache 2.0 license on Hugging Face and GitHub."""@en .

<#privacy-by-design> a schema:DefinedTerm ;
  schema:name "Privacy-by-design system"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The article’s reminder that Privacy Filter is one component in a broader privacy-preserving architecture rather than a standalone compliance guarantee."""@en .

<#domain-adaptation> a schema:DefinedTerm ;
  schema:name "Domain adaptation through fine-tuning"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The ability to fine-tune the model on small in-domain datasets to improve performance on organization-specific privacy tasks."""@en .

<#benchmark-f1> a schema:DefinedTerm ;
  schema:name "Benchmark F1 performance"@en ;
  schema:inDefinedTermSet <#defined-terms> ;
  schema:description """The headline metric family reported in the article: 96% F1 on PII-Masking-300k and 97.43% on a corrected version of that benchmark."""@en .

# Glossary container for the article.
<#defined-terms> a schema:DefinedTermSet ;
  schema:name "Defined terms for Introducing OpenAI Privacy Filter"@en ;
  # <#openai-privacy-filter> is typed SoftwareApplication/Product rather than
  # DefinedTerm, so it stays under schema:hasPart (whose values are CreativeWorks).
  schema:hasPart <#openai-privacy-filter> ;
  # schema:hasDefinedTerm is the DefinedTermSet property whose values are
  # DefinedTerm nodes; schema:hasPart expects CreativeWork values, which
  # DefinedTerm is not.
  schema:hasDefinedTerm
    <#pii-detection>,
    <#local-redaction>,
    <#token-classification-architecture>,
    <#span-decoding>,
    <#privacy-taxonomy>,
    <#context-aware-redaction>,
    <#pii-masking-300k>,
    <#annotation-correction>,
    <#account-number-label>,
    <#secret-label>,
    <#apache-2-license>,
    <#privacy-by-design>,
    <#domain-adaptation>,
    <#benchmark-f1> ;
  schema:isPartOf <#article> .

# Per-section summaries of the article, ordered by schema:position.
# Each node states schema:isPartOf <#article>, the inverse of the article's
# schema:hasPart, matching the FAQ, HowTo, and glossary nodes that already
# carry their own isPartOf link.

<#part-capability> a schema:WebPageElement ;
  schema:name "A small model with frontier personal data detection capability"@en ;
  schema:position 1 ;
  schema:isPartOf <#article> ;
  schema:about <#openai-privacy-filter>, <#context-aware-redaction>, <#local-redaction> ;
  schema:text """The post positions Privacy Filter as a small local model that handles nuanced PII decisions in unstructured text better than narrow pattern-matching systems."""@en .

<#part-architecture> a schema:WebPageElement ;
  schema:name "Model overview"@en ;
  schema:position 2 ;
  schema:isPartOf <#article> ;
  schema:about <#token-classification-architecture>, <#span-decoding>, <#privacy-taxonomy> ;
  schema:text """The model is described as a bidirectional token classifier with constrained span decoding over eight privacy categories and support for up to 128,000 tokens."""@en .

<#part-build> a schema:WebPageElement ;
  schema:name "How we built it"@en ;
  schema:position 3 ;
  schema:isPartOf <#article> ;
  schema:about <#privacy-taxonomy>, <#domain-adaptation> ;
  schema:text """OpenAI says the model was trained on a mixture of public and synthetic data, with model-assisted annotation to improve incomplete labels and broaden subtype coverage."""@en .

<#part-performance> a schema:WebPageElement ;
  schema:name "How Privacy Filter performs"@en ;
  schema:position 4 ;
  schema:isPartOf <#article> ;
  schema:about <#pii-masking-300k>, <#annotation-correction>, <#benchmark-f1> ;
  schema:text """The article reports 96% F1 on PII-Masking-300k and 97.43% on a corrected benchmark, plus strong domain adaptation and targeted secret-detection results."""@en .

<#part-limitations> a schema:WebPageElement ;
  schema:name "Limitations"@en ;
  schema:position 5 ;
  schema:isPartOf <#article> ;
  schema:about <#privacy-by-design> ;
  schema:text """OpenAI explicitly says Privacy Filter is not anonymization, not compliance certification, and not a substitute for policy review or domain-specific evaluation in high-stakes settings."""@en .

<#part-availability> a schema:WebPageElement ;
  schema:name "Availability"@en ;
  schema:position 6 ;
  schema:isPartOf <#article> ;
  schema:about <#apache-2-license> ;
  schema:text """The model is released as open weights under Apache 2.0 on Hugging Face and GitHub, with documentation on architecture, taxonomy, controls, and limitations."""@en .

# Outline of the article's line of argument, modelled as a HowTo with four ordered steps.
<#argument-howto>
  a schema:HowTo ;
  schema:isPartOf <#article> ;
  schema:name "How the article builds the Privacy Filter argument"@en ;
  schema:description "The post moves from the practical need for context-aware privacy filtering, to the model architecture, to benchmark evidence, and finally to limitations and open release conditions."@en ;
  schema:about
    <#openai-privacy-filter> ,
    <#privacy-by-design> ;
  schema:step
    <#step-1> ,
    <#step-2> ,
    <#step-3> ,
    <#step-4> .

# Steps of <#argument-howto>, ordered by schema:position.

<#step-1>
  a schema:HowToStep ;
  schema:isPartOf <#argument-howto> ;
  schema:position 1 ;
  schema:name "Define the privacy filtering problem"@en ;
  schema:text """The article first explains why traditional deterministic PII systems miss subtle or context-dependent personal information."""@en .

<#step-2>
  a schema:HowToStep ;
  schema:isPartOf <#argument-howto> ;
  schema:position 2 ;
  schema:name "Describe the model architecture"@en ;
  schema:text """Privacy Filter is then presented as a token classifier with span decoding, long context support, and configurable operating points."""@en .

<#step-3>
  a schema:HowToStep ;
  schema:isPartOf <#argument-howto> ;
  schema:position 3 ;
  schema:name "Back the claims with benchmark and adaptation results"@en ;
  schema:text """The post provides benchmark F1, precision, recall, and domain adaptation numbers to support the release."""@en .

<#step-4>
  a schema:HowToStep ;
  schema:isPartOf <#argument-howto> ;
  schema:position 4 ;
  schema:name "Constrain the scope and release openly"@en ;
  schema:text """The article closes by emphasizing limitations, recommending human review in high-stakes domains, and releasing the model openly under Apache 2.0."""@en .

# FAQ entries derived from the article. Each Question carries the same string
# as schema:name and schema:text and points to one Answer node via
# schema:acceptedAnswer; questions and answers both declare membership in <#article>.

# --- FAQ 1 ---
<#faq-1>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "What is OpenAI Privacy Filter?"@en ;
  schema:text "What is OpenAI Privacy Filter?"@en ;
  schema:acceptedAnswer <#faq-1-answer> .

<#faq-1-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "It is an open-weight model for detecting and redacting personally identifiable information in text."@en .

# --- FAQ 2 ---
<#faq-2>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "Why does the article say Privacy Filter is different from traditional PII tools?"@en ;
  schema:text "Why does the article say Privacy Filter is different from traditional PII tools?"@en ;
  schema:acceptedAnswer <#faq-2-answer> .

<#faq-2-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "Because it uses language and context understanding rather than only deterministic rules for narrow formats such as phone numbers or email addresses."@en .

# --- FAQ 3 ---
<#faq-3>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "Can Privacy Filter run locally?"@en ;
  schema:text "Can Privacy Filter run locally?"@en ;
  schema:acceptedAnswer <#faq-3-answer> .

<#faq-3-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "Yes. The article explicitly says it can run locally so unfiltered data does not need to leave the machine for masking or redaction."@en .

# --- FAQ 4 ---
<#faq-4>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "What is the model architecture?"@en ;
  schema:text "What is the model architecture?"@en ;
  schema:acceptedAnswer <#faq-4-answer> .

<#faq-4-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "It is a bidirectional token-classification model with constrained span decoding built from an autoregressive pretrained checkpoint."@en .

# --- FAQ 5 ---
<#faq-5>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "How large is the released model?"@en ;
  schema:text "How large is the released model?"@en ;
  schema:acceptedAnswer <#faq-5-answer> .

<#faq-5-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "The released model has 1.5 billion total parameters with 50 million active parameters."@en .

# --- FAQ 6 ---
<#faq-6>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "What labels does the model predict?"@en ;
  schema:text "What labels does the model predict?"@en ;
  schema:acceptedAnswer <#faq-6-answer> .

<#faq-6-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "The article lists eight categories including private person, address, email, phone, URL, date, account number, and secret."@en .

# --- FAQ 7 ---
<#faq-7>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "What benchmark results are reported?"@en ;
  schema:text "What benchmark results are reported?"@en ;
  schema:acceptedAnswer <#faq-7-answer> .

<#faq-7-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "OpenAI reports 96% F1 on PII-Masking-300k and 97.43% F1 on a corrected version of that benchmark."@en .

# --- FAQ 8 ---
<#faq-8>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "Can the model be adapted for domain-specific tasks?"@en ;
  schema:text "Can the model be adapted for domain-specific tasks?"@en ;
  schema:acceptedAnswer <#faq-8-answer> .

<#faq-8-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "Yes. The article says fine-tuning on even a small amount of domain-specific data can quickly improve accuracy."@en .

# --- FAQ 9 ---
<#faq-9>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "What does the article say Privacy Filter is not?"@en ;
  schema:text "What does the article say Privacy Filter is not?"@en ;
  schema:acceptedAnswer <#faq-9-answer> .

<#faq-9-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "It is not anonymization, not compliance certification, and not a substitute for policy review or human oversight in high-stakes domains."@en .

# --- FAQ 10 ---
<#faq-10>
  a schema:Question ;
  schema:isPartOf <#article> ;
  schema:name "How is the model being released?"@en ;
  schema:text "How is the model being released?"@en ;
  schema:acceptedAnswer <#faq-10-answer> .

<#faq-10-answer>
  a schema:Answer ;
  schema:isPartOf <#article> ;
  schema:text "The model is available under the Apache 2.0 license on Hugging Face and GitHub for experimentation, customization, and commercial deployment."@en .