# Create and activate an isolated virtual environment, then install
# transformers and diffusers from their GitHub sources.
python -m venv .venv
source .venv/bin/activate # (Windows) .venv\Scripts\activate
pip install --upgrade pip
pip install git+https://github.com/huggingface/transformers.git
pip install git+https://github.com/huggingface/diffusers.git

# Install SGLang with the "diffusion" extra, followed by transformers and
# diffusers from their GitHub sources.
pip install "sglang[diffusion] @ git+https://github.com/sgl-project/sglang.git#subdirectory=python"
pip install git+https://github.com/huggingface/transformers.git
pip install git+https://github.com/huggingface/diffusers.git
sglang serve --model-path zai-org/GLM-Image

import torch
from diffusers.pipelines.glm_image import GlmImagePipeline

# Load the GLM-Image pipeline in bfloat16 with device_map="cuda".
pipe = GlmImagePipeline.from_pretrained(
    "zai-org/GLM-Image",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

prompt = "A PPT slide with clear hierarchy. Title: \"Quarterly Growth\" ..."

# CUDA random generator seeded with 42.
generator = torch.Generator(device="cuda").manual_seed(42)

# Text-to-image call; height and width are given as multiples of 32.
result = pipe(
    prompt=prompt,
    height=32 * 32,
    width=36 * 32,
    num_inference_steps=50,
    guidance_scale=1.5,
    generator=generator,
)
image = result.images[0]
image.save("output_t2i.png")

import torch
from PIL import Image
from diffusers.pipelines.glm_image import GlmImagePipeline

# Load the GLM-Image pipeline in bfloat16 with device_map="cuda".
pipe = GlmImagePipeline.from_pretrained(
    "zai-org/GLM-Image",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

# Conditioning image, converted to RGB mode.
cond = Image.open("cond.jpg").convert("RGB")
prompt = "Replace the background of the snow forest with an underground station featuring an automatic escalator."

# CUDA random generator seeded with 42.
generator = torch.Generator(device="cuda").manual_seed(42)

out = pipe(
    prompt=prompt,
    image=[cond],
    # Always specify height and width, even when they match the input image.
    height=33 * 32,
    width=32 * 32,
    num_inference_steps=50,
    guidance_scale=1.5,
    generator=generator,
).images[0]
out.save("output_i2i.png")

curl --request POST \
--url https://api.z.ai/api/paas/v4/images/generations \
--header 'Authorization: Bearer <token>' \
--header 'Content-Type: application/json' \
--data '{
"model": "glm-image",
"prompt": "A poster with clear multi-line text. Title: \"GLM-Image\" ...",
"size": "1280x1280"
}'

Design a modern promotional poster with a clean grid layout and strong visual hierarchy.
Top area: big bold title text: "GLM-Image Open Source"
Below title: subtitle text: "Text + Layout + Meaning preserved"
Center: a simple abstract illustration (minimal, not distracting)
Bottom area: three bullet-style blocks with icons:
"Posters with readable text"
"PPT-style slides"
"Logical infographics & diagrams"
Typography: sans-serif, high contrast, crisp edges.
Do not add any additional text beyond the quoted text.

Create a 16:9 PPT slide with a professional business style.
Header: title text "Project Update"
Left column: section title "Key Metrics" and three lines:
"DAU: 1.2M"
"Retention: 38%"
"Conversion: 4.1%"
Right column: a simple bar chart illustration (no random labels).
Footer: small disclaimer text "Internal draft"
All visible text must match exactly and be enclosed in quotes. No extra text.

Create an infographic with 4 numbered steps in a vertical flow.
Each step is a rounded rectangle connected by arrows.
Step titles:
"Step 1: Collect"
"Step 2: Clean"
"Step 3: Train"
"Step 4: Evaluate"
Add a small caption under each step (one short sentence).
Use a minimal flat design with clear spacing.
All text must be exact and only the quoted text should appear.

Draw a clean system diagram on a white background.
Three modules as boxes:
"Input" -> "AR Generator" -> "Diffusion Decoder"
Add small labels near arrows:
"tokens"
"latent refinement"
Add a legend box in bottom-right:
"AR: global structure"
"Decoder: details & text strokes"
No additional text beyond the quoted text.

import re
from difflib import SequenceMatcher

import numpy as np
import torch
from diffusers.pipelines.glm_image import GlmImagePipeline
from paddleocr import PaddleOCR
# 1) Expected text: must be identical to the quoted text placed in the prompt.
EXPECTED = [
    "GLM-Image Open Source",
    "Text + Layout + Meaning preserved",
    "Posters with readable text",
    "PPT-style slides",
    "Logical infographics & diagrams",
]
def normalize(s: str) -> str:
    """Return *s* lower-cased with whitespace collapsed and trimmed.

    Every run of whitespace becomes a single space, and leading/trailing
    whitespace is removed, so strings that differ only in case or spacing
    compare equal.
    """
    s = s.lower()
    s = re.sub(r"\s+", " ", s).strip()
    return s
def score_ocr(extracted: str, expected_list: list[str]) -> float:
    """Score how well OCR output matches a list of expected strings.

    Each expected string contributes ``0.7 * contain + 0.3 * sim`` where
    ``contain`` is 1.0 when the normalized expected string is a substring of
    the normalized OCR text (else 0.0), and ``sim`` is the
    ``SequenceMatcher`` ratio between the normalized expected string and the
    whole normalized OCR text.

    Args:
        extracted: Text produced by OCR.
        expected_list: Strings that should appear in the OCR text.

    Returns:
        The mean per-string score, or 0.0 when ``expected_list`` is empty
        (instead of dividing by zero).
    """
    if not expected_list:
        return 0.0

    extracted_n = normalize(extracted)
    scores = []
    for t in expected_list:
        t_n = normalize(t)
        # Simple blend of exact containment and fuzzy similarity.
        contain = 1.0 if t_n in extracted_n else 0.0
        sim = SequenceMatcher(None, t_n, extracted_n).ratio()
        scores.append(0.7 * contain + 0.3 * sim)
    return sum(scores) / len(scores)
# 2) OCR engine (for Korean text, consider setting lang="korean").
ocr = PaddleOCR(use_angle_cls=True, lang="en")

# 3) Local GLM-Image pipeline.
pipe = GlmImagePipeline.from_pretrained(
    "zai-org/GLM-Image",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
# Poster prompt, one entry per line; the quoted phrases are the text that
# should appear in the generated image.
prompt = "\n".join(
    [
        'Design a modern promotional poster with a clean grid layout and strong visual hierarchy.',
        'Top area: big bold title text: "GLM-Image Open Source"',
        'Below title: subtitle text: "Text + Layout + Meaning preserved"',
        'Bottom area: three blocks with icons:',
        '"Posters with readable text"',
        '"PPT-style slides"',
        '"Logical infographics & diagrams"',
        'Typography: sans-serif, high contrast, crisp edges.',
        'Do not add any additional text beyond the quoted text.',
    ]
)
# Generate one image per seed and keep the one whose OCR text scores highest.
best = {"score": -1, "seed": None, "image": None}
for seed in [1, 2, 3, 4, 5]:
    img = pipe(
        prompt=prompt,
        width=1280,
        height=1280,
        num_inference_steps=50,
        guidance_scale=1.5,
        generator=torch.Generator(device="cuda").manual_seed(seed),
    ).images[0]

    # 4) Run OCR. The pipeline returns a PIL image, so it is converted to an
    # ndarray before being handed to PaddleOCR.
    ocr_result = ocr.ocr(np.array(img), cls=True)
    # Skip empty/None blocks so an image with no detected text yields "".
    extracted = " ".join(
        line[1][0]
        for block in (ocr_result or [])
        if block
        for line in block
    )

    # 5) Score the recognized text against the expected strings.
    s = score_ocr(extracted, EXPECTED)
    if s > best["score"]:
        best.update({"score": s, "seed": seed, "image": img})

print("BEST:", best["score"], "seed:", best["seed"])
best["image"].save("best_poster.png")